/**
 * LLM screening quality test script.
 * Based on quality assurance strategy v1.0.0.
 * MVP targets: accuracy >= 85%, dual-model consistency rate >= 80%.
 *
 * Flow: load labelled sample literature, run each sample through the
 * dual-model screening service, aggregate quality metrics, then write a JSON
 * result file and a Markdown report into the output directory.
 */

import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
import { logger } from '../src/common/logging/index.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Test configuration
const TEST_CONFIG = {
  sampleFile: path.join(__dirname, 'test-samples/asl-test-literatures.json'),
  outputDir: path.join(__dirname, 'test-results'),
  models: {
    model1: 'deepseek-chat',
    model2: 'qwen-max',
  },
  // Concurrency level (to avoid API rate limiting).
  // NOTE(review): not read by the code below, which processes samples one at a time.
  concurrency: 2,
};

// Pass/fail thresholds shared by the console output, the summary and the report.
const QUALITY_TARGETS = {
  minAccuracy: 0.85,
  minConsistencyRate: 0.8,
  maxNeedReviewRate: 0.2,
};

// A sample whose average confidence is below this value is flagged for manual review.
const REVIEW_CONFIDENCE_THRESHOLD = 0.7;

// Pause between two consecutive screening calls (to avoid API rate limiting).
const REQUEST_DELAY_MS = 1000;

// Maximum number of title characters echoed to the console per sample.
const TITLE_PREVIEW_LENGTH = 80;

// PICO criteria (example: systematic review of SGLT2 inhibitors)
const PICO_CRITERIA = {
  population: '2型糖尿病成人患者',
  intervention: 'SGLT2抑制剂(如empagliflozin、dapagliflozin、canagliflozin等)',
  comparison: '安慰剂或常规降糖疗法',
  outcome: '心血管结局(主要不良心血管事件、心衰住院、心血管死亡)',
  studyDesign: '随机对照试验(RCT)',
};

const INCLUSION_CRITERIA = `
1. 成人2型糖尿病患者(≥18岁)
2. 随机对照试验(RCT)设计
3. 干预措施为SGLT2抑制剂单药或联合治疗
4. 报告心血管结局数据
5. 英文文献
6. 发表于2010年后
`;

const EXCLUSION_CRITERIA = `
1. 综述、系统评价、Meta分析
2. 病例报告、病例系列
3. 动物实验或体外实验
4. 会议摘要(未发表完整文章)
5. 健康志愿者研究
6. 1型糖尿病患者
7. 观察性研究(队列、病例对照)
`;

// Quality metrics aggregated over one test run.
interface QualityMetrics {
  totalTests: number;
  correctDecisions: number;
  accuracy: number;
  consistencyRate: number;
  jsonValidRate: number;
  avgConfidence: number;
  needReviewRate: number;
  confusionMatrix: {
    truePositive: number;
    falsePositive: number;
    trueNegative: number;
    falseNegative: number;
    uncertain: number;
  };
}

// Outcome of screening a single sample.
interface TestResult {
  literatureId: string;
  title: string;
  expectedDecision: string;
  actualDecision: string;
  isCorrect: boolean;
  hasConsensus: boolean;
  needReview: boolean;
  avgConfidence: number;
  // Raw per-model payloads returned by the screening service; their shape is
  // declared in the service module, so they stay loosely typed here.
  deepseekResult: any;
  qwenResult: any;
  processingTime: number;
}

// Fields this script reads from every entry of the sample file.
interface TestSample {
  id: string;
  title: string;
  abstract: string;
  expectedDecision: string;
}

/** Returns the pass/fail marker used in console output and in the report. */
function statusMark(ok: boolean): string {
  return ok ? '✅' : '❌';
}

/** Formats a 0..1 ratio as a percentage with one decimal place (no % sign). */
function toPercent(ratio: number): string {
  return (ratio * 100).toFixed(1);
}

/** Coerces a model confidence to a finite number, falling back to 0. */
function toConfidence(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/** Shortens `text` to `max` characters, appending "..." only when something was cut. */
function truncate(text: string, max: number): string {
  return text.length > max ? `${text.substring(0, max)}...` : text;
}

/** Resolves after `ms` milliseconds. */
function delay(ms: number): Promise<void> {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Reads and parses the sample file.
 *
 * @throws Error when the file content is not a JSON array. Individual entries
 *   are not validated here; a malformed entry surfaces as a failed sample.
 */
async function loadSamples(file: string): Promise<TestSample[]> {
  const raw = await fs.readFile(file, 'utf-8');
  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error(`测试样本文件必须是JSON数组: ${file}`);
  }
  return parsed as TestSample[];
}

/**
 * Screens one sample with both models and compares the outcome with the
 * expected decision. Never rejects: any failure is logged and turned into a
 * result whose `actualDecision` is 'error'.
 */
async function screenSample(sample: TestSample): Promise<TestResult> {
  const startTime = Date.now();

  try {
    // Dual-model screening call
    const screeningResult = await llmScreeningService.dualModelScreening(
      sample.id,
      sample.title,
      sample.abstract,
      PICO_CRITERIA,
      INCLUSION_CRITERIA,
      EXCLUSION_CRITERIA,
      [TEST_CONFIG.models.model1, TEST_CONFIG.models.model2]
    );

    const processingTime = Date.now() - startTime;

    // Correctness of the final decision
    const actualDecision = screeningResult.finalDecision || 'pending';
    const expectedDecision = sample.expectedDecision;
    const isCorrect = actualDecision === expectedDecision;

    // Average confidence of the two models
    const avgConfidence =
      (toConfidence(screeningResult.deepseek.confidence) +
        toConfidence(screeningResult.qwen.confidence)) /
      2;

    const result: TestResult = {
      literatureId: sample.id,
      title: sample.title,
      expectedDecision,
      actualDecision,
      isCorrect,
      hasConsensus: !screeningResult.hasConflict,
      needReview: screeningResult.hasConflict || avgConfidence < REVIEW_CONFIDENCE_THRESHOLD,
      avgConfidence,
      deepseekResult: screeningResult.deepseek,
      qwenResult: screeningResult.qwen,
      processingTime,
    };

    console.log(`   ${statusMark(isCorrect)} 期望: ${expectedDecision}, 实际: ${actualDecision}`);
    console.log(`   一致性: ${screeningResult.hasConflict ? '❌ 冲突' : '✅ 一致'}`);
    console.log(`   置信度: ${avgConfidence.toFixed(2)}`);
    console.log(`   耗时: ${processingTime}ms`);
    console.log('');

    return result;
  } catch (error) {
    console.error(`   ❌ 测试失败:`, error);
    return {
      literatureId: sample.id,
      title: sample.title,
      expectedDecision: sample.expectedDecision,
      actualDecision: 'error',
      isCorrect: false,
      hasConsensus: false,
      needReview: true,
      avgConfidence: 0,
      deepseekResult: null,
      qwenResult: null,
      processingTime: Date.now() - startTime,
    };
  }
}

/** Screens all samples sequentially, pausing between consecutive calls. */
async function runScreening(samples: TestSample[]): Promise<TestResult[]> {
  const results: TestResult[] = [];

  for (const [index, sample] of samples.entries()) {
    console.log(`[${index + 1}/${samples.length}] 测试文献: ${sample.id}`);
    // The title is coerced defensively so a malformed sample cannot abort the run here.
    console.log(`   标题: ${truncate(String(sample.title ?? ''), TITLE_PREVIEW_LENGTH)}`);

    results.push(await screenSample(sample));

    // Avoid API rate limiting. The pause also applies after a failed sample.
    if (index < samples.length - 1) {
      await delay(REQUEST_DELAY_MS);
    }
  }

  return results;
}

/** Prints the aggregated metrics and the confusion matrix to the console. */
function printMetrics(metrics: QualityMetrics): void {
  const accuracyOk = metrics.accuracy >= QUALITY_TARGETS.minAccuracy;
  const consistencyOk = metrics.consistencyRate >= QUALITY_TARGETS.minConsistencyRate;
  const reviewOk = metrics.needReviewRate <= QUALITY_TARGETS.maxNeedReviewRate;

  console.log(`总测试数: ${metrics.totalTests}`);
  console.log(`正确决策: ${metrics.correctDecisions}`);
  console.log(`准确率: ${toPercent(metrics.accuracy)}% ${statusMark(accuracyOk)} (目标≥85%)`);
  console.log(`一致率: ${toPercent(metrics.consistencyRate)}% ${statusMark(consistencyOk)} (目标≥80%)`);
  console.log(`平均置信度: ${metrics.avgConfidence.toFixed(2)}`);
  console.log(`需人工复核: ${toPercent(metrics.needReviewRate)}% ${statusMark(reviewOk)} (目标≤20%)`);

  console.log('\n混淆矩阵:');
  console.log(`  真阳性(TP): ${metrics.confusionMatrix.truePositive}`);
  console.log(`  假阳性(FP): ${metrics.confusionMatrix.falsePositive}`);
  console.log(`  真阴性(TN): ${metrics.confusionMatrix.trueNegative}`);
  console.log(`  假阴性(FN): ${metrics.confusionMatrix.falseNegative}`);
  console.log(`  不确定: ${metrics.confusionMatrix.uncertain}`);
}

/** Prints the final verdict and, when a target is missed, improvement hints. */
function printSummary(metrics: QualityMetrics): void {
  const accuracyOk = metrics.accuracy >= QUALITY_TARGETS.minAccuracy;
  const consistencyOk = metrics.consistencyRate >= QUALITY_TARGETS.minConsistencyRate;
  const reviewOk = metrics.needReviewRate <= QUALITY_TARGETS.maxNeedReviewRate;

  if (accuracyOk && consistencyOk && reviewOk) {
    console.log('✅ 所有质量指标达标!MVP阶段质量要求满足。');
    return;
  }

  console.log('❌ 部分质量指标未达标,需要优化Prompt或调整策略。');
  console.log('\n改进建议:');
  if (!accuracyOk) {
    console.log('  - 优化Prompt,增加示例和指导');
    console.log('  - 检查错误案例,找出共性问题');
  }
  if (!consistencyOk) {
    console.log('  - 提高Prompt的明确性和一致性');
    console.log('  - 考虑增加Few-shot示例');
  }
  if (!reviewOk) {
    console.log('  - 优化置信度评分策略');
    console.log('  - 调整人工复核阈值');
  }
}

/**
 * Script entry point. Runs the whole quality test and writes the result files.
 * Any failure outside per-sample screening is logged and terminates the
 * process with exit code 1.
 */
async function main() {
  console.log('🚀 启动LLM筛选质量测试\n');
  console.log('='.repeat(80));
  console.log('测试配置:');
  console.log(`  模型组合: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}`);
  console.log(`  PICO标准: SGLT2抑制剂 RCT 心血管结局`);
  console.log(`  质量目标: 准确率≥85%, 一致率≥80%, JSON验证≥95%`);
  console.log('='.repeat(80) + '\n');

  try {
    // 1. Load test samples
    console.log('📖 加载测试样本...');
    const samples = await loadSamples(TEST_CONFIG.sampleFile);
    console.log(`✅ 加载${samples.length}篇测试文献\n`);

    // 2. Run the screening tests
    console.log('🧪 开始执行筛选测试...\n');
    const results = await runScreening(samples);

    // 3. Compute quality metrics
    console.log('\n' + '='.repeat(80));
    console.log('📊 质量指标统计\n');
    const metrics = calculateMetrics(results);
    printMetrics(metrics);

    // 4. Save raw results
    console.log('\n💾 保存测试结果...');
    await fs.mkdir(TEST_CONFIG.outputDir, { recursive: true });

    // ':' and '.' are replaced so the timestamp can be used inside a file name.
    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
    const outputFile = path.join(TEST_CONFIG.outputDir, `test-results-${timestamp}.json`);
    await fs.writeFile(outputFile, JSON.stringify({ metrics, results }, null, 2), 'utf-8');
    console.log(`✅ 结果已保存: ${outputFile}`);

    // 5. Generate the Markdown report
    console.log('\n📋 生成测试报告...');
    const report = generateReport(metrics, results);
    const reportFile = path.join(TEST_CONFIG.outputDir, `test-report-${timestamp}.md`);
    await fs.writeFile(reportFile, report, 'utf-8');
    console.log(`✅ 报告已生成: ${reportFile}`);

    // 6. Summary
    console.log('\n' + '='.repeat(80));
    console.log('🎯 测试总结\n');
    printSummary(metrics);
    console.log('='.repeat(80));
  } catch (error) {
    console.error('❌ 测试失败:', error);
    process.exit(1);
  }
}

/**
 * Aggregates per-sample results into run-level quality metrics.
 *
 * All rates are 0 for an empty input. The confusion matrix treats 'include'
 * as the positive class; every result that is not a clean include/exclude
 * pair (e.g. 'uncertain', 'pending', 'error', or an unexpected label) is
 * counted as `uncertain`, so the five cells always add up to `totalTests`.
 */
function calculateMetrics(results: TestResult[]): QualityMetrics {
  const totalTests = results.length;
  const rateOf = (count: number): number => (totalTests > 0 ? count / totalTests : 0);

  const correctDecisions = results.filter(r => r.isCorrect).length;
  const consensusCount = results.filter(r => r.hasConsensus).length;
  const needReviewCount = results.filter(r => r.needReview).length;
  const totalConfidence = results.reduce((sum, r) => sum + r.avgConfidence, 0);

  // Confusion matrix
  const confusionMatrix = {
    truePositive: 0,
    falsePositive: 0,
    trueNegative: 0,
    falseNegative: 0,
    uncertain: 0,
  };

  for (const { expectedDecision, actualDecision } of results) {
    if (actualDecision === 'include' && expectedDecision === 'include') {
      confusionMatrix.truePositive++;
    } else if (actualDecision === 'include' && expectedDecision === 'exclude') {
      confusionMatrix.falsePositive++;
    } else if (actualDecision === 'exclude' && expectedDecision === 'exclude') {
      confusionMatrix.trueNegative++;
    } else if (actualDecision === 'exclude' && expectedDecision === 'include') {
      confusionMatrix.falseNegative++;
    } else {
      confusionMatrix.uncertain++;
    }
  }

  return {
    totalTests,
    correctDecisions,
    accuracy: rateOf(correctDecisions),
    consistencyRate: rateOf(consensusCount),
    // Validated automatically by AJV.
    // NOTE(review): this is a fixed placeholder; the script does not measure it.
    jsonValidRate: 1.0,
    avgConfidence: rateOf(totalConfidence),
    needReviewRate: rateOf(needReviewCount),
    confusionMatrix,
  };
}

/** Renders "conclusion (置信度: x.xx)" for one model, or N/A where data is missing. */
function describeModelResult(modelResult: any): string {
  const conclusion = modelResult?.conclusion ?? 'N/A';
  const confidence =
    typeof modelResult?.confidence === 'number' ? modelResult.confidence.toFixed(2) : 'N/A';
  return `${conclusion} (置信度: ${confidence})`;
}

/**
 * Builds the Markdown report: metric table, confusion matrix and one section
 * per sample.
 *
 * @param metrics Aggregated metrics of the run.
 * @param results Per-sample results, in execution order.
 * @returns The complete report as a Markdown string.
 */
function generateReport(metrics: QualityMetrics, results: TestResult[]): string {
  const accuracyOk = metrics.accuracy >= QUALITY_TARGETS.minAccuracy;
  const consistencyOk = metrics.consistencyRate >= QUALITY_TARGETS.minConsistencyRate;
  const reviewOk = metrics.needReviewRate <= QUALITY_TARGETS.maxNeedReviewRate;
  const matrix = metrics.confusionMatrix;

  const details = results
    .map(
      (r, i) => `
### ${i + 1}. ${r.literatureId}

**标题**: ${r.title}
**期望决策**: ${r.expectedDecision}
**实际决策**: ${r.actualDecision}
**结果**: ${r.isCorrect ? '✅ 正确' : '❌ 错误'}
**一致性**: ${r.hasConsensus ? '✅ 一致' : '❌ 冲突'}
**平均置信度**: ${r.avgConfidence.toFixed(2)}
**处理时间**: ${r.processingTime}ms
**需人工复核**: ${r.needReview ? '是' : '否'}

**DeepSeek结论**: ${describeModelResult(r.deepseekResult)}
**Qwen结论**: ${describeModelResult(r.qwenResult)}
`
    )
    .join('\n');

  return `# LLM筛选质量测试报告

**测试时间**: ${new Date().toISOString()}
**测试模型**: ${TEST_CONFIG.models.model1} + ${TEST_CONFIG.models.model2}
**测试样本数**: ${metrics.totalTests}

---

## 质量指标

| 指标 | 实际值 | 目标值 | 状态 |
|------|--------|--------|------|
| 准确率 | ${toPercent(metrics.accuracy)}% | ≥85% | ${statusMark(accuracyOk)} |
| 一致率 | ${toPercent(metrics.consistencyRate)}% | ≥80% | ${statusMark(consistencyOk)} |
| 平均置信度 | ${metrics.avgConfidence.toFixed(2)} | - | - |
| 需人工复核率 | ${toPercent(metrics.needReviewRate)}% | ≤20% | ${statusMark(reviewOk)} |

---

## 混淆矩阵

\`\`\`
            预测纳入    预测排除    不确定
实际纳入    ${matrix.truePositive}    ${matrix.falseNegative}    -
实际排除    ${matrix.falsePositive}    ${matrix.trueNegative}    -
不确定      -    -    ${matrix.uncertain}
\`\`\`

---

## 详细结果

${details}

---

**生成时间**: ${new Date().toISOString()}
`;
}

// Run the test. main() handles its own errors; this is a last-resort guard
// that still reports a failing exit code.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});