refactor(asl): ASL frontend architecture refactoring with left navigation
- feat: Create ASLLayout component with 7-module left navigation
- feat: Implement Title Screening Settings page with optimized PICOS layout
- feat: Add placeholder pages for Workbench and Results
- fix: Fix nested routing structure for React Router v6
- fix: Resolve Spin component warning in MainLayout
- fix: Add QueryClientProvider to App.tsx
- style: Optimize PICOS form layout (P+I left, C+O+S right)
- style: Align Inclusion/Exclusion criteria side-by-side
- docs: Add architecture refactoring and routing fix reports

Ref: Week 2 Frontend Development
Scope: ASL module MVP - Title Abstract Screening
This commit is contained in:
377
backend/scripts/test-llm-screening.ts
Normal file
377
backend/scripts/test-llm-screening.ts
Normal file
@@ -0,0 +1,377 @@
|
||||
/**
 * LLM screening quality test script.
 * Based on the quality assurance strategy v1.0.0.
 * MVP targets: accuracy ≥ 85%, dual-model agreement rate ≥ 80%.
 */
|
||||
|
||||
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
|
||||
import { logger } from '../src/common/logging/index.js';
|
||||
import * as fs from 'fs/promises';
|
||||
import * as path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Test configuration.
 *
 * - sampleFile: JSON file with the labelled test literature, resolved relative to this script.
 * - outputDir: directory that receives the JSON results and the Markdown report.
 * - models: the two model identifiers passed to the dual-model screening call.
 * - concurrency: intended concurrency level (to stay under API rate limits).
 *   NOTE(review): the main loop in this file screens samples sequentially and
 *   does not read this value.
 */
const TEST_CONFIG = {
  sampleFile: path.join(__dirname, 'test-samples/asl-test-literatures.json'),
  outputDir: path.join(__dirname, 'test-results'),
  models: {
    model1: 'deepseek-chat',
    model2: 'qwen-max',
  },
  concurrency: 2,
};
|
||||
|
||||
/**
 * PICO criteria used for every screening call in this script
 * (example topic: a systematic review of SGLT2 inhibitors).
 * The values are prompt text and are passed to the screening service unchanged.
 */
const PICO_CRITERIA = {
  population: '2型糖尿病成人患者',
  intervention: 'SGLT2抑制剂(如empagliflozin、dapagliflozin、canagliflozin等)',
  comparison: '安慰剂或常规降糖疗法',
  outcome: '心血管结局(主要不良心血管事件、心衰住院、心血管死亡)',
  studyDesign: '随机对照试验(RCT)',
};
|
||||
|
||||
/**
 * Inclusion criteria, one numbered item per line.
 * The text is passed to the screening service verbatim, so the literal must
 * contain nothing but the criteria themselves.
 */
const INCLUSION_CRITERIA = `
1. 成人2型糖尿病患者(≥18岁)
2. 随机对照试验(RCT)设计
3. 干预措施为SGLT2抑制剂单药或联合治疗
4. 报告心血管结局数据
5. 英文文献
6. 发表于2010年后
`;
|
||||
|
||||
/**
 * Exclusion criteria, one numbered item per line.
 * The text is passed to the screening service verbatim, so the literal must
 * contain nothing but the criteria themselves.
 */
const EXCLUSION_CRITERIA = `
1. 综述、系统评价、Meta分析
2. 病例报告、病例系列
3. 动物实验或体外实验
4. 会议摘要(未发表完整文章)
5. 健康志愿者研究
6. 1型糖尿病患者
7. 观察性研究(队列、病例对照)
`;
|
||||
|
||||
/** Aggregate quality metrics computed over one test run. */
interface QualityMetrics {
  /** Number of results the metrics were computed from. */
  totalTests: number;
  /** Number of results flagged as correct. */
  correctDecisions: number;
  /** correctDecisions / totalTests, as a fraction in [0, 1]. */
  accuracy: number;
  /** Fraction of results on which the two models reached consensus. */
  consistencyRate: number;
  /** Fraction of results with schema-valid model output. */
  jsonValidRate: number;
  /** Mean of the per-result average confidence. */
  avgConfidence: number;
  /** Fraction of results flagged for manual review. */
  needReviewRate: number;
  /** Counts of expected-vs-actual decisions; "positive" means "include". */
  confusionMatrix: {
    truePositive: number;
    falsePositive: number;
    trueNegative: number;
    falseNegative: number;
    uncertain: number;
  };
}
|
||||
|
||||
/** Outcome of screening a single test sample. */
interface TestResult {
  /** Identifier of the sample, copied from the sample file. */
  literatureId: string;
  title: string;
  /** Labelled decision from the sample file. */
  expectedDecision: string;
  /**
   * Decision produced by the run. Besides the service's own values this can be
   * 'pending' (no final decision returned) or 'error' (the call threw).
   */
  actualDecision: string;
  /** True when actualDecision equals expectedDecision. */
  isCorrect: boolean;
  /** True when the two models did not conflict. */
  hasConsensus: boolean;
  /** True when the result should be checked by a human. */
  needReview: boolean;
  /** Mean of the two models' confidence values; 0 for errored calls. */
  avgConfidence: number;
  // The per-model payloads come from the screening service and are null when
  // the call failed. Their shape is declared outside this file, hence `any`.
  deepseekResult: any;
  qwenResult: any;
  /** Wall-clock duration of the screening call in milliseconds. */
  processingTime: number;
}
|
||||
|
||||
/**
 * Entry point of the screening quality test.
 *
 * Steps: load the labelled samples from TEST_CONFIG.sampleFile, screen each one
 * with llmScreeningService.dualModelScreening, aggregate the outcomes with
 * calculateMetrics, write a JSON result file and a Markdown report into
 * TEST_CONFIG.outputDir, then print a pass/fail summary against the thresholds
 * (accuracy ≥ 0.85, consistency ≥ 0.80, need-review ≤ 0.20).
 *
 * A failure while screening one sample is recorded as an 'error' result and
 * the run continues; any other failure is logged and ends the process with
 * exit code 1.
 */
async function main(): Promise<void> {
  console.log('🚀 启动LLM筛选质量测试\n');
  console.log('='.repeat(80));
  console.log('测试配置:');
  console.log(` 模型组合: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}`);
  console.log(` PICO标准: SGLT2抑制剂 RCT 心血管结局`);
  console.log(` 质量目标: 准确率≥85%, 一致率≥80%, JSON验证≥95%`);
  console.log('='.repeat(80) + '\n');

  try {
    // 1. Load the labelled test samples.
    console.log('📖 加载测试样本...');
    const samplesContent = await fs.readFile(TEST_CONFIG.sampleFile, 'utf-8');
    const samples = JSON.parse(samplesContent);
    // Fail loudly on a malformed sample file instead of silently running zero tests.
    if (!Array.isArray(samples)) {
      throw new Error(`测试样本文件必须是JSON数组: ${TEST_CONFIG.sampleFile}`);
    }
    console.log(`✅ 加载${samples.length}篇测试文献\n`);

    // 2. Screen every sample, one at a time.
    console.log('🧪 开始执行筛选测试...\n');
    const results: TestResult[] = [];

    for (let i = 0; i < samples.length; i++) {
      const sample = samples[i];
      console.log(`[${i + 1}/${samples.length}] 测试文献: ${sample.id}`);
      // Guard the preview so a sample without a title cannot abort the whole run.
      console.log(` 标题: ${String(sample.title ?? '').substring(0, 80)}...`);

      const startTime = Date.now();

      try {
        // Run the dual-model screening for this sample.
        const screeningResult = await llmScreeningService.dualModelScreening(
          sample.id,
          sample.title,
          sample.abstract,
          PICO_CRITERIA,
          INCLUSION_CRITERIA,
          EXCLUSION_CRITERIA,
          [TEST_CONFIG.models.model1, TEST_CONFIG.models.model2]
        );

        const processingTime = Date.now() - startTime;

        // Compare the final decision with the labelled one; a missing final
        // decision is reported as 'pending'.
        const actualDecision = screeningResult.finalDecision || 'pending';
        const expectedDecision = sample.expectedDecision;
        const isCorrect = actualDecision === expectedDecision;

        // Average the two models' confidence, treating a missing value as 0.
        const avgConfidence = (
          (screeningResult.deepseek.confidence || 0) +
          (screeningResult.qwen.confidence || 0)
        ) / 2;

        const result: TestResult = {
          literatureId: sample.id,
          title: sample.title,
          expectedDecision,
          actualDecision,
          isCorrect,
          hasConsensus: !screeningResult.hasConflict,
          // Flag for manual review on model conflict or low average confidence.
          needReview: screeningResult.hasConflict || avgConfidence < 0.7,
          avgConfidence,
          deepseekResult: screeningResult.deepseek,
          qwenResult: screeningResult.qwen,
          processingTime,
        };

        results.push(result);

        console.log(` ${isCorrect ? '✅' : '❌'} 期望: ${expectedDecision}, 实际: ${actualDecision}`);
        console.log(` 一致性: ${screeningResult.hasConflict ? '❌ 冲突' : '✅ 一致'}`);
        console.log(` 置信度: ${avgConfidence.toFixed(2)}`);
        console.log(` 耗时: ${processingTime}ms`);
        console.log('');
      } catch (error) {
        // Record the failure as an 'error' result so it still counts in the metrics.
        console.error(` ❌ 测试失败:`, error);
        results.push({
          literatureId: sample.id,
          title: sample.title,
          expectedDecision: sample.expectedDecision,
          actualDecision: 'error',
          isCorrect: false,
          hasConsensus: false,
          needReview: true,
          avgConfidence: 0,
          deepseekResult: null,
          qwenResult: null,
          processingTime: Date.now() - startTime,
        });
      }

      // Pause between requests to avoid API rate limits. This runs after failed
      // calls too, so an error is never followed by an immediate next request.
      if (i < samples.length - 1) {
        await new Promise(resolve => setTimeout(resolve, 1000));
      }
    }

    // 3. Compute and print the quality metrics.
    console.log('\n' + '='.repeat(80));
    console.log('📊 质量指标统计\n');

    const metrics = calculateMetrics(results);

    console.log(`总测试数: ${metrics.totalTests}`);
    console.log(`正确决策: ${metrics.correctDecisions}`);
    console.log(`准确率: ${(metrics.accuracy * 100).toFixed(1)}% ${metrics.accuracy >= 0.85 ? '✅' : '❌'} (目标≥85%)`);
    console.log(`一致率: ${(metrics.consistencyRate * 100).toFixed(1)}% ${metrics.consistencyRate >= 0.80 ? '✅' : '❌'} (目标≥80%)`);
    console.log(`平均置信度: ${metrics.avgConfidence.toFixed(2)}`);
    console.log(`需人工复核: ${(metrics.needReviewRate * 100).toFixed(1)}% ${metrics.needReviewRate <= 0.20 ? '✅' : '❌'} (目标≤20%)`);
    console.log('\n混淆矩阵:');
    console.log(` 真阳性(TP): ${metrics.confusionMatrix.truePositive}`);
    console.log(` 假阳性(FP): ${metrics.confusionMatrix.falsePositive}`);
    console.log(` 真阴性(TN): ${metrics.confusionMatrix.trueNegative}`);
    console.log(` 假阴性(FN): ${metrics.confusionMatrix.falseNegative}`);
    console.log(` 不确定: ${metrics.confusionMatrix.uncertain}`);

    // 4. Persist the raw results.
    console.log('\n💾 保存测试结果...');
    await fs.mkdir(TEST_CONFIG.outputDir, { recursive: true });

    // ':' and '.' are replaced so the timestamp is usable in a file name.
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const outputFile = path.join(
      TEST_CONFIG.outputDir,
      `test-results-${timestamp}.json`
    );

    await fs.writeFile(
      outputFile,
      JSON.stringify({ metrics, results }, null, 2),
      'utf-8'
    );

    console.log(`✅ 结果已保存: ${outputFile}`);

    // 5. Write the Markdown report.
    console.log('\n📋 生成测试报告...');
    const report = generateReport(metrics, results);
    const reportFile = path.join(
      TEST_CONFIG.outputDir,
      `test-report-${timestamp}.md`
    );

    await fs.writeFile(reportFile, report, 'utf-8');
    console.log(`✅ 报告已生成: ${reportFile}`);

    // 6. Print the summary and, on failure, improvement hints per missed target.
    console.log('\n' + '='.repeat(80));
    console.log('🎯 测试总结\n');

    const allPassed =
      metrics.accuracy >= 0.85 &&
      metrics.consistencyRate >= 0.80 &&
      metrics.needReviewRate <= 0.20;

    if (allPassed) {
      console.log('✅ 所有质量指标达标!MVP阶段质量要求满足。');
    } else {
      console.log('❌ 部分质量指标未达标,需要优化Prompt或调整策略。');
      console.log('\n改进建议:');
      if (metrics.accuracy < 0.85) {
        console.log(' - 优化Prompt,增加示例和指导');
        console.log(' - 检查错误案例,找出共性问题');
      }
      if (metrics.consistencyRate < 0.80) {
        console.log(' - 提高Prompt的明确性和一致性');
        console.log(' - 考虑增加Few-shot示例');
      }
      if (metrics.needReviewRate > 0.20) {
        console.log(' - 优化置信度评分策略');
        console.log(' - 调整人工复核阈值');
      }
    }

    console.log('='.repeat(80));
  } catch (error) {
    console.error('❌ 测试失败:', error);
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Aggregates per-sample results into run-level quality metrics.
 *
 * All rates are fractions of `results.length` and are 0 for an empty input,
 * except `jsonValidRate`, which stays 1.0 when there is nothing to invalidate it.
 *
 * Confusion matrix: "positive" means "include". Every result whose actual
 * decision is neither 'include' nor 'exclude' (for example 'uncertain',
 * 'pending' or 'error') is counted as `uncertain`, so such results are no
 * longer dropped from the matrix. Results whose expected decision is neither
 * 'include' nor 'exclude' fit no cell and are still left out.
 *
 * @param results Per-sample outcomes of one test run.
 * @returns The aggregated metrics.
 */
function calculateMetrics(results: TestResult[]): QualityMetrics {
  const totalTests = results.length;
  const rateOf = (count: number): number => (totalTests > 0 ? count / totalTests : 0);

  const correctDecisions = results.filter(r => r.isCorrect).length;
  const consensusCount = results.filter(r => r.hasConsensus).length;
  const needReviewCount = results.filter(r => r.needReview).length;
  const erroredCount = results.filter(r => r.actualDecision === 'error').length;
  const totalConfidence = results.reduce((sum, r) => sum + r.avgConfidence, 0);

  const confusionMatrix = {
    truePositive: 0,
    falsePositive: 0,
    trueNegative: 0,
    falseNegative: 0,
    uncertain: 0,
  };

  for (const r of results) {
    const predictedInclude = r.actualDecision === 'include';
    const predictedExclude = r.actualDecision === 'exclude';

    if (!predictedInclude && !predictedExclude) {
      // No usable include/exclude verdict was produced for this sample.
      confusionMatrix.uncertain++;
    } else if (r.expectedDecision === 'include') {
      if (predictedInclude) {
        confusionMatrix.truePositive++;
      } else {
        confusionMatrix.falseNegative++;
      }
    } else if (r.expectedDecision === 'exclude') {
      if (predictedInclude) {
        confusionMatrix.falsePositive++;
      } else {
        confusionMatrix.trueNegative++;
      }
    }
  }

  return {
    totalTests,
    correctDecisions,
    accuracy: rateOf(correctDecisions),
    consistencyRate: rateOf(consensusCount),
    // Share of samples that did not end in 'error'. This replaces a constant
    // 1.0 that could never reflect a failed call.
    // NOTE(review): assumes schema-validation failures inside the screening
    // service surface as thrown errors; other failures (e.g. network) lower
    // this value as well, so read it as an upper bound on valid output.
    jsonValidRate: totalTests > 0 ? (totalTests - erroredCount) / totalTests : 1.0,
    avgConfidence: rateOf(totalConfidence),
    needReviewRate: rateOf(needReviewCount),
    confusionMatrix,
  };
}
|
||||
|
||||
/**
 * Renders the Markdown report for one test run.
 *
 * The report contains the run header (timestamp, model pair, sample count),
 * the metrics table with pass/fail marks against the fixed thresholds
 * (accuracy ≥ 85%, consistency ≥ 80%, need-review ≤ 20%), the confusion
 * matrix as a Markdown table, and one detail section per result.
 *
 * Per-model conclusion and confidence are printed as "N/A" when the model
 * payload is missing (errored calls store null) or the confidence is not a
 * number, instead of the literal text "undefined".
 *
 * @param metrics Aggregated metrics of the run.
 * @param results Per-sample outcomes, listed in the given order.
 * @returns The complete report as a Markdown string.
 */
function generateReport(metrics: QualityMetrics, results: TestResult[]): string {
  // One timestamp for the whole report so header and footer always agree.
  const generatedAt = new Date().toISOString();
  const matrix = metrics.confusionMatrix;

  const mark = (passed: boolean): string => (passed ? '✅' : '❌');
  const percent = (ratio: number): string => `${(ratio * 100).toFixed(1)}%`;

  // Formats one model's verdict, tolerating a missing payload or confidence.
  const describeModel = (modelResult: TestResult['deepseekResult']): string => {
    const conclusion = modelResult?.conclusion ?? 'N/A';
    const confidence =
      typeof modelResult?.confidence === 'number'
        ? modelResult.confidence.toFixed(2)
        : 'N/A';
    return `${conclusion} (置信度: ${confidence})`;
  };

  const details = results.map((r, i) => `
### ${i + 1}. ${r.literatureId}

**标题**: ${r.title}
**期望决策**: ${r.expectedDecision}
**实际决策**: ${r.actualDecision}
**结果**: ${r.isCorrect ? '✅ 正确' : '❌ 错误'}
**一致性**: ${r.hasConsensus ? '✅ 一致' : '❌ 冲突'}
**平均置信度**: ${r.avgConfidence.toFixed(2)}
**处理时间**: ${r.processingTime}ms
**需人工复核**: ${r.needReview ? '是' : '否'}

**DeepSeek结论**: ${describeModel(r.deepseekResult)}
**Qwen结论**: ${describeModel(r.qwenResult)}
`).join('\n');

  return `# LLM筛选质量测试报告

**测试时间**: ${generatedAt}
**测试模型**: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}
**测试样本数**: ${metrics.totalTests}

---

## 质量指标

| 指标 | 实际值 | 目标值 | 状态 |
|------|--------|--------|------|
| 准确率 | ${percent(metrics.accuracy)} | ≥85% | ${mark(metrics.accuracy >= 0.85)} |
| 一致率 | ${percent(metrics.consistencyRate)} | ≥80% | ${mark(metrics.consistencyRate >= 0.80)} |
| 平均置信度 | ${metrics.avgConfidence.toFixed(2)} | - | - |
| 需人工复核率 | ${percent(metrics.needReviewRate)} | ≤20% | ${mark(metrics.needReviewRate <= 0.20)} |

---

## 混淆矩阵

| | 预测纳入 | 预测排除 | 不确定 |
|------|------|------|------|
| 实际纳入 | ${matrix.truePositive} | ${matrix.falseNegative} | - |
| 实际排除 | ${matrix.falsePositive} | ${matrix.trueNegative} | - |
| 不确定 | - | - | ${matrix.uncertain} |

---

## 详细结果

${details}

---

**生成时间**: ${generatedAt}
`;
}
|
||||
|
||||
// Run the test. Any rejection that escapes main() is logged and marks the
// process as failed, so callers never see exit code 0 after a crash.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user