Files
AIclinicalresearch/backend/scripts/test-llm-screening.ts
HaHafeng 3634933ece refactor(asl): ASL frontend architecture refactoring with left navigation
- feat: Create ASLLayout component with 7-module left navigation
- feat: Implement Title Screening Settings page with optimized PICOS layout
- feat: Add placeholder pages for Workbench and Results
- fix: Fix nested routing structure for React Router v6
- fix: Resolve Spin component warning in MainLayout
- fix: Add QueryClientProvider to App.tsx
- style: Optimize PICOS form layout (P+I left, C+O+S right)
- style: Align Inclusion/Exclusion criteria side-by-side
- docs: Add architecture refactoring and routing fix reports

Ref: Week 2 Frontend Development
Scope: ASL module MVP - Title Abstract Screening
2025-11-18 21:51:51 +08:00

378 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* LLM筛选质量测试脚本
* 基于质量保障策略 v1.0.0
* MVP目标: 准确率≥85%, 双模型一致率≥80%
*/
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
import { logger } from '../src/common/logging/index.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';
// ES modules do not define __filename/__dirname, so derive them from this
// module's URL. The file paths in TEST_CONFIG are built relative to __dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Test configuration
const TEST_CONFIG = {
// JSON file with the sample literatures; main() reads id, title, abstract and
// expectedDecision from each entry.
sampleFile: path.join(__dirname, 'test-samples/asl-test-literatures.json'),
// Directory that receives the JSON results and the Markdown report
// (main() creates it if it does not exist).
outputDir: path.join(__dirname, 'test-results'),
// The two model identifiers handed, in this order, to dualModelScreening.
models: {
model1: 'deepseek-chat',
model2: 'qwen-max'
},
// NOTE(review): this value is not read anywhere in the visible script —
// main() screens the samples one at a time. Confirm whether it is still needed.
concurrency: 2, // Concurrency level, to avoid API rate limiting
};
// Example PICO criteria: a systematic review of SGLT2 inhibitors.
// main() passes this object to dualModelScreening as the PICO argument.
// The values are Chinese-language text and are sent as-is.
const PICO_CRITERIA = {
// Population: adults with type 2 diabetes.
population: '2型糖尿病成人患者',
// Intervention: SGLT2 inhibitors (e.g. empagliflozin, dapagliflozin, canagliflozin).
intervention: 'SGLT2抑制剂如empagliflozin、dapagliflozin、canagliflozin等',
// Comparison: placebo or conventional glucose-lowering therapy.
comparison: '安慰剂或常规降糖疗法',
// Outcome: cardiovascular outcomes (MACE, heart-failure hospitalisation, cardiovascular death).
outcome: '心血管结局(主要不良心血管事件、心衰住院、心血管死亡)',
// Study design: randomised controlled trial (RCT).
studyDesign: '随机对照试验RCT'
};
// Inclusion criteria passed verbatim to dualModelScreening by main().
// Numbered Chinese-language list; the string starts and ends with a newline.
// In English:
//   1. adult type 2 diabetes patients (>= 18 years)
//   2. randomised controlled trial design
//   3. intervention is an SGLT2 inhibitor, alone or in combination
//   4. reports cardiovascular outcome data
//   5. English-language publication
//   6. published after 2010
const INCLUSION_CRITERIA = `
1. 成人2型糖尿病患者≥18岁
2. 随机对照试验RCT设计
3. 干预措施为SGLT2抑制剂单药或联合治疗
4. 报告心血管结局数据
5. 英文文献
6. 发表于2010年后
`;
// Exclusion criteria passed verbatim to dualModelScreening by main().
// Numbered Chinese-language list; the string starts and ends with a newline.
// In English:
//   1. reviews, systematic reviews, meta-analyses
//   2. case reports, case series
//   3. animal or in-vitro experiments
//   4. conference abstracts (no full article published)
//   5. studies in healthy volunteers
//   6. type 1 diabetes patients
//   7. observational studies (cohort, case-control)
const EXCLUSION_CRITERIA = `
1. 综述、系统评价、Meta分析
2. 病例报告、病例系列
3. 动物实验或体外实验
4. 会议摘要(未发表完整文章)
5. 健康志愿者研究
6. 1型糖尿病患者
7. 观察性研究(队列、病例对照)
`;
/**
 * Aggregate quality metrics for one test run, as produced by calculateMetrics().
 * All rates are fractions in the range 0..1 (callers multiply by 100 for display).
 */
interface QualityMetrics {
// Number of results that were aggregated.
totalTests: number;
// Number of results whose isCorrect flag is true.
correctDecisions: number;
// correctDecisions / totalTests (0 when there are no results).
accuracy: number;
// Share of results where both models agreed (hasConsensus is true).
consistencyRate: number;
// Currently always 1.0 — calculateMetrics() hard-codes it rather than measuring it.
jsonValidRate: number;
// Mean of the per-result avgConfidence values.
avgConfidence: number;
// Share of results flagged needReview.
needReviewRate: number;
// Counts of expected-vs-actual decision pairs ('include' is the positive class).
confusionMatrix: {
// expected 'include', actual 'include'
truePositive: number;
// expected 'exclude', actual 'include'
falsePositive: number;
// expected 'exclude', actual 'exclude'
trueNegative: number;
// expected 'include', actual 'exclude'
falseNegative: number;
// actual decision is 'uncertain'
uncertain: number;
};
}
/**
 * Outcome of screening a single sample literature, as recorded by main().
 */
interface TestResult {
// The sample's id field.
literatureId: string;
// The sample's title field.
title: string;
// The sample's expectedDecision field (the reference answer).
expectedDecision: string;
// The service's finalDecision, 'pending' when that is empty, or 'error' when the call threw.
actualDecision: string;
// True when actualDecision strictly equals expectedDecision.
isCorrect: boolean;
// True when the service reported no conflict between the two models.
hasConsensus: boolean;
// True when the models conflicted, avgConfidence is below 0.7, or the call failed.
needReview: boolean;
// Mean of the two models' confidence values (a missing value counts as 0); 0 on failure.
avgConfidence: number;
// Raw per-model payloads from the screening service; null when the call failed.
deepseekResult: any;
qwenResult: any;
// Wall-clock duration of the screening call in milliseconds.
processingTime: number;
}
/**
 * Runs the end-to-end LLM screening quality test.
 *
 * Steps:
 *  1. load the sample literatures from TEST_CONFIG.sampleFile;
 *  2. screen each sample, one at a time, with both configured models via
 *     llmScreeningService.dualModelScreening;
 *  3. aggregate the results with calculateMetrics() and print them;
 *  4. write the raw results (JSON) and a Markdown report to TEST_CONFIG.outputDir;
 *  5. print a pass/fail summary with improvement hints.
 *
 * A failure while screening one sample is recorded as an 'error' result and the
 * run continues. Any other failure (unreadable or malformed sample file, write
 * error, ...) is logged and terminates the process with exit code 1.
 */
async function main(): Promise<void> {
  // Pause inserted between consecutive samples to avoid API rate limiting.
  const requestIntervalMs = 1000;

  console.log('🚀 启动LLM筛选质量测试\n');
  console.log('='.repeat(80));
  console.log('测试配置:');
  console.log(` 模型组合: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}`);
  console.log(` PICO标准: SGLT2抑制剂 RCT 心血管结局`);
  console.log(` 质量目标: 准确率≥85%, 一致率≥80%, JSON验证≥95%`);
  console.log('='.repeat(80) + '\n');

  try {
    // 1. Load the test samples.
    console.log('📖 加载测试样本...');
    const samplesContent = await fs.readFile(TEST_CONFIG.sampleFile, 'utf-8');
    const parsedSamples: unknown = JSON.parse(samplesContent);
    // Fail loudly on a non-array file; otherwise the loop below would silently
    // run zero tests and still write a report.
    if (!Array.isArray(parsedSamples)) {
      throw new Error(`测试样本文件必须是JSON数组: ${TEST_CONFIG.sampleFile}`);
    }
    const samples = parsedSamples;
    console.log(`✅ 加载${samples.length}篇测试文献\n`);

    // 2. Run the screening tests sequentially.
    console.log('🧪 开始执行筛选测试...\n');
    const results: TestResult[] = [];

    for (let i = 0; i < samples.length; i++) {
      const sample = samples[i];
      console.log(`[${i + 1}/${samples.length}] 测试文献: ${sample.id}`);
      // Tolerate a missing/non-string title here so that one malformed sample
      // cannot abort the whole run before the per-sample try block is entered.
      console.log(` 标题: ${String(sample.title ?? '').substring(0, 80)}...`);

      const startTime = Date.now();
      try {
        // Screen the literature with both models.
        const screeningResult = await llmScreeningService.dualModelScreening(
          sample.id,
          sample.title,
          sample.abstract,
          PICO_CRITERIA,
          INCLUSION_CRITERIA,
          EXCLUSION_CRITERIA,
          [TEST_CONFIG.models.model1, TEST_CONFIG.models.model2]
        );
        const processingTime = Date.now() - startTime;

        // Compare the final decision with the reference answer.
        const actualDecision = screeningResult.finalDecision || 'pending';
        const expectedDecision = sample.expectedDecision;
        const isCorrect = actualDecision === expectedDecision;

        // Mean confidence of the two models; a missing value counts as 0.
        const avgConfidence = (
          (screeningResult.deepseek.confidence || 0) +
          (screeningResult.qwen.confidence || 0)
        ) / 2;

        const result: TestResult = {
          literatureId: sample.id,
          title: sample.title,
          expectedDecision,
          actualDecision,
          isCorrect,
          hasConsensus: !screeningResult.hasConflict,
          // Flag for manual review on model conflict or low mean confidence.
          needReview: screeningResult.hasConflict || avgConfidence < 0.7,
          avgConfidence,
          deepseekResult: screeningResult.deepseek,
          qwenResult: screeningResult.qwen,
          processingTime,
        };
        results.push(result);

        console.log(` ${isCorrect ? '✅' : '❌'} 期望: ${expectedDecision}, 实际: ${actualDecision}`);
        console.log(` 一致性: ${screeningResult.hasConflict ? '❌ 冲突' : '✅ 一致'}`);
        console.log(` 置信度: ${avgConfidence.toFixed(2)}`);
        console.log(` 耗时: ${processingTime}ms`);
        console.log('');
      } catch (error) {
        // Record the failure and keep going with the remaining samples.
        console.error(` ❌ 测试失败:`, error);
        results.push({
          literatureId: sample.id,
          title: sample.title,
          expectedDecision: sample.expectedDecision,
          actualDecision: 'error',
          isCorrect: false,
          hasConsensus: false,
          needReview: true,
          avgConfidence: 0,
          deepseekResult: null,
          qwenResult: null,
          processingTime: Date.now() - startTime,
        });
      }

      // Throttle before the next sample whether or not this one succeeded: a
      // failed call may itself be a rate-limit rejection, so the pause must not
      // be skipped on the error path.
      if (i < samples.length - 1) {
        await new Promise(resolve => setTimeout(resolve, requestIntervalMs));
      }
    }

    // 3. Compute and print the quality metrics.
    console.log('\n' + '='.repeat(80));
    console.log('📊 质量指标统计\n');
    const metrics = calculateMetrics(results);
    console.log(`总测试数: ${metrics.totalTests}`);
    console.log(`正确决策: ${metrics.correctDecisions}`);
    console.log(`准确率: ${(metrics.accuracy * 100).toFixed(1)}% ${metrics.accuracy >= 0.85 ? '✅' : '❌'} (目标≥85%)`);
    console.log(`一致率: ${(metrics.consistencyRate * 100).toFixed(1)}% ${metrics.consistencyRate >= 0.80 ? '✅' : '❌'} (目标≥80%)`);
    console.log(`平均置信度: ${metrics.avgConfidence.toFixed(2)}`);
    console.log(`需人工复核: ${(metrics.needReviewRate * 100).toFixed(1)}% ${metrics.needReviewRate <= 0.20 ? '✅' : '❌'} (目标≤20%)`);
    console.log('\n混淆矩阵:');
    console.log(` 真阳性(TP): ${metrics.confusionMatrix.truePositive}`);
    console.log(` 假阳性(FP): ${metrics.confusionMatrix.falsePositive}`);
    console.log(` 真阴性(TN): ${metrics.confusionMatrix.trueNegative}`);
    console.log(` 假阴性(FN): ${metrics.confusionMatrix.falseNegative}`);
    console.log(` 不确定: ${metrics.confusionMatrix.uncertain}`);

    // 4. Persist the raw results.
    console.log('\n💾 保存测试结果...');
    await fs.mkdir(TEST_CONFIG.outputDir, { recursive: true });
    // ':' and '.' are replaced so the ISO timestamp is safe to use in a file name.
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const outputFile = path.join(
      TEST_CONFIG.outputDir,
      `test-results-${timestamp}.json`
    );
    await fs.writeFile(
      outputFile,
      JSON.stringify({ metrics, results }, null, 2),
      'utf-8'
    );
    console.log(`✅ 结果已保存: ${outputFile}`);

    // 5. Generate the Markdown report.
    console.log('\n📋 生成测试报告...');
    const report = generateReport(metrics, results);
    const reportFile = path.join(
      TEST_CONFIG.outputDir,
      `test-report-${timestamp}.md`
    );
    await fs.writeFile(reportFile, report, 'utf-8');
    console.log(`✅ 报告已生成: ${reportFile}`);

    // 6. Summary against the quality targets.
    console.log('\n' + '='.repeat(80));
    console.log('🎯 测试总结\n');
    const allPassed =
      metrics.accuracy >= 0.85 &&
      metrics.consistencyRate >= 0.80 &&
      metrics.needReviewRate <= 0.20;

    if (allPassed) {
      console.log('✅ 所有质量指标达标MVP阶段质量要求满足。');
    } else {
      console.log('❌ 部分质量指标未达标需要优化Prompt或调整策略。');
      console.log('\n改进建议');
      if (metrics.accuracy < 0.85) {
        console.log(' - 优化Prompt增加示例和指导');
        console.log(' - 检查错误案例,找出共性问题');
      }
      if (metrics.consistencyRate < 0.80) {
        console.log(' - 提高Prompt的明确性和一致性');
        console.log(' - 考虑增加Few-shot示例');
      }
      if (metrics.needReviewRate > 0.20) {
        console.log(' - 优化置信度评分策略');
        console.log(' - 调整人工复核阈值');
      }
    }
    console.log('='.repeat(80));
  } catch (error) {
    console.error('❌ 测试失败:', error);
    process.exit(1);
  }
}
/**
 * Aggregates per-literature results into overall quality metrics.
 *
 * Every rate is a fraction of `results.length` and is 0 for an empty input
 * (never NaN). Failed calls are part of `results`, so they count against
 * accuracy and consistency and contribute 0 to the mean confidence.
 *
 * Confusion matrix: 'include' is the positive class. Any result that is not a
 * definite include/exclude pair — an actual decision of 'uncertain', 'pending'
 * or 'error', or an unexpected reference value — is counted under `uncertain`,
 * so the five cells always sum to `totalTests`.
 *
 * @param results One entry per screened literature.
 * @returns The aggregated metrics.
 */
function calculateMetrics(results: TestResult[]): QualityMetrics {
  const totalTests = results.length;
  // Guarded division so an empty run reports 0 instead of NaN.
  const perTest = (value: number): number => (totalTests > 0 ? value / totalTests : 0);

  let correctDecisions = 0;
  let consensusCount = 0;
  let needReviewCount = 0;
  let totalConfidence = 0;
  const confusionMatrix = {
    truePositive: 0,
    falsePositive: 0,
    trueNegative: 0,
    falseNegative: 0,
    uncertain: 0,
  };

  for (const r of results) {
    if (r.isCorrect) correctDecisions++;
    if (r.hasConsensus) consensusCount++;
    if (r.needReview) needReviewCount++;
    totalConfidence += r.avgConfidence;

    const expected = r.expectedDecision;
    const actual = r.actualDecision;
    if (expected === 'include' && actual === 'include') {
      confusionMatrix.truePositive++;
    } else if (expected === 'exclude' && actual === 'include') {
      confusionMatrix.falsePositive++;
    } else if (expected === 'exclude' && actual === 'exclude') {
      confusionMatrix.trueNegative++;
    } else if (expected === 'include' && actual === 'exclude') {
      confusionMatrix.falseNegative++;
    } else {
      // No definite classification: keep the row instead of dropping it.
      confusionMatrix.uncertain++;
    }
  }

  return {
    totalTests,
    correctDecisions,
    accuracy: perTest(correctDecisions),
    consistencyRate: perTest(consensusCount),
    // Fixed value, not measured here. The original note says JSON output is
    // validated automatically by AJV — presumably inside the screening service.
    jsonValidRate: 1.0,
    avgConfidence: perTest(totalConfidence),
    needReviewRate: perTest(needReviewCount),
    confusionMatrix,
  };
}
/**
 * Formats one model's conclusion and confidence for the report.
 * Falls back to "N/A" when the model payload is null (failed call) or a field
 * is missing / not a number, instead of printing "undefined".
 */
function formatModelVerdict(
  modelResult: { conclusion?: unknown; confidence?: unknown } | null | undefined
): string {
  const conclusion = modelResult?.conclusion ?? 'N/A';
  const rawConfidence = modelResult?.confidence;
  const confidence = typeof rawConfidence === 'number' ? rawConfidence.toFixed(2) : 'N/A';
  return `${conclusion} (置信度: ${confidence})`;
}

/**
 * Renders the Markdown quality report for one test run.
 *
 * Sections: header (timestamp, model pair, sample count), metrics table with
 * pass/fail marks against the fixed targets, confusion matrix, and one
 * sub-section per screened literature.
 *
 * Blank lines surround every `---` so that it is parsed as a horizontal rule
 * rather than as the underline of a setext heading for the preceding line.
 *
 * @param metrics Aggregated metrics for the run.
 * @param results Per-literature results, rendered in the given order.
 * @returns The complete report as a Markdown string.
 */
function generateReport(metrics: QualityMetrics, results: TestResult[]): string {
  // Single timestamp so the header and footer of the report always agree.
  const generatedAt = new Date().toISOString();
  const mark = (passed: boolean): string => (passed ? '✅' : '❌');
  const percent = (rate: number): string => `${(rate * 100).toFixed(1)}%`;
  const matrix = metrics.confusionMatrix;

  const details = results.map((r, i) => `
### ${i + 1}. ${r.literatureId}
**标题**: ${r.title}
**期望决策**: ${r.expectedDecision}
**实际决策**: ${r.actualDecision}
**结果**: ${r.isCorrect ? '✅ 正确' : '❌ 错误'}
**一致性**: ${r.hasConsensus ? '✅ 一致' : '❌ 冲突'}
**平均置信度**: ${r.avgConfidence.toFixed(2)}
**处理时间**: ${r.processingTime}ms
**需人工复核**: ${r.needReview ? '是' : '否'}
**DeepSeek结论**: ${formatModelVerdict(r.deepseekResult)}
**Qwen结论**: ${formatModelVerdict(r.qwenResult)}
`).join('\n');

  return `# LLM筛选质量测试报告

**测试时间**: ${generatedAt}
**测试模型**: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}
**测试样本数**: ${metrics.totalTests}

---

## 质量指标

| 指标 | 实际值 | 目标值 | 状态 |
|------|--------|--------|------|
| 准确率 | ${percent(metrics.accuracy)} | ≥85% | ${mark(metrics.accuracy >= 0.85)} |
| 一致率 | ${percent(metrics.consistencyRate)} | ≥80% | ${mark(metrics.consistencyRate >= 0.80)} |
| 平均置信度 | ${metrics.avgConfidence.toFixed(2)} | - | - |
| 需人工复核率 | ${percent(metrics.needReviewRate)} | ≤20% | ${mark(metrics.needReviewRate <= 0.20)} |

---

## 混淆矩阵

\`\`\`
预测纳入 预测排除 不确定
实际纳入 ${matrix.truePositive} ${matrix.falseNegative} -
实际排除 ${matrix.falsePositive} ${matrix.trueNegative} -
不确定 - - ${matrix.uncertain}
\`\`\`

---

## 详细结果
${details}

---

**生成时间**: ${generatedAt}
`;
}
// Script entry point: start the test run and print any rejection that escapes main().
main().catch((reason) => {
  console.error(reason);
});