/**
 * Stroke dataset screening test - lenient mode.
 *
 * Purpose: check whether the lenient prompt improves initial-screening
 * accuracy, and print a comparison against the recorded standard-mode numbers.
 *
 * Strategy of the lenient prompt:
 * - prefer over-inclusion to missing a relevant study
 * - exclude only records that clearly do not match
 * - lean towards inclusion for borderline cases
 */
import * as fs from 'fs';
import * as path from 'path';
import * as XLSX from 'xlsx';
import { fileURLToPath } from 'url';
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// PICOS criteria passed to the screening service.
const picoCriteria = {
  population: '非心源性缺血性卒中患者、亚洲人群',
  intervention: '抗血小板药物/抗凝药物/溶栓药物(阿司匹林、氯吡格雷、替格瑞洛、达比加群等)',
  comparison: '安慰剂或常规治疗',
  outcome: '卒中进展、复发、残疾程度、死亡率、出血事件等',
  studyDesign: 'SR、RCT、RWE、OBS'
};

const inclusionCriteria = ` 1. 研究对象为非心源性缺血性卒中患者 2. 研究人群为亚洲人群(优先) 3. 干预措施为抗血小板/抗凝/溶栓药物 4. 对照组为安慰剂或常规治疗 5. 研究时间在2020年之后 6. 研究设计为SR、RCT、RWE、OBS `;

const exclusionCriteria = ` 1. 综述、病例报告、会议摘要 2. 动物实验、体外实验 3. 研究人群非亚洲人群(除非有特殊价值) 4. 研究时间在2020年之前 5. 心源性卒中或出血性卒中 `;

/**
 * Columns this script reads from each spreadsheet row. Cell contents are not
 * validated, so every field is `unknown` and converted with `toText`.
 */
interface TestCaseRow {
  title?: unknown;
  abstract?: unknown;
  Decision?: unknown;
}

/** Converts a spreadsheet cell to a string; null/undefined become ''. */
function toText(value: unknown): string {
  return value == null ? '' : String(value);
}

/** True when the row's `Decision` cell contains `keyword` (case-insensitive). */
function decisionMentions(row: TestCaseRow, keyword: string): boolean {
  return toText(row.Decision).toLowerCase().includes(keyword);
}

/**
 * Formats `numerator / denominator` as a percentage with one decimal.
 * Returns 'N/A' for an empty denominator instead of printing `NaN%`.
 */
function formatPercent(numerator: number, denominator: number): string {
  if (denominator === 0) {
    return 'N/A';
  }
  return `${((numerator / denominator) * 100).toFixed(1)}%`;
}

/**
 * Formats the difference, in percentage points, between
 * `numerator / denominator` and `baselinePercent`.
 * Returns 'N/A' for an empty denominator.
 */
function formatDelta(numerator: number, denominator: number, baselinePercent: number): string {
  if (denominator === 0) {
    return 'N/A';
  }
  return `${((numerator / denominator) * 100 - baselinePercent).toFixed(1)}%`;
}

// Load the test cases from the first sheet of the workbook.
const excelPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx'
);
const workbook = XLSX.read(fs.readFileSync(excelPath), { type: 'buffer' });
const data: TestCaseRow[] = XLSX.utils.sheet_to_json(workbook.Sheets[workbook.SheetNames[0]]);

// Sample: the first 2 rows marked as included and the first 3 marked as excluded.
const includedCases = data.filter((row) => decisionMentions(row, 'include')).slice(0, 2);
const excludedCases = data.filter((row) => decisionMentions(row, 'exclude')).slice(0, 3);
const testCases = [...includedCases, ...excludedCases];

console.log('\n🚀 开始宽松模式测试\n');
console.log(`📊 测试配置:`);
console.log(` - 模型组合: DeepSeek-V3 + Qwen-Max`);
console.log(` - 筛选风格: 宽松模式(lenient)`);
console.log(` - 测试样本: ${testCases.length}篇\n`);

/** One successfully screened case, as used by the report. */
interface TestResult {
  caseIndex: number;
  title: string;
  humanDecision: string;
  aiDecision: string;
  model1Conclusion: string;
  model2Conclusion: string;
  isCorrect: boolean;
  hasConflict: boolean;
  confidence: number;
  reason: string;
}

// Standard-mode figures (in percent) shown in the comparison table.
const STANDARD_MODE_BASELINE = {
  accuracy: 60,
  recall: 0,
  exclusionAccuracy: 100
};

/**
 * Screens every sampled case with the dual-model service in lenient mode and
 * prints an accuracy report to the console.
 *
 * Cases whose screening call throws are logged, counted, and left out of the
 * statistics. If no case succeeds, the report is skipped.
 *
 * @returns A promise that resolves once the report has been printed.
 */
async function runTest(): Promise<void> {
  const results: TestResult[] = [];
  let failedCount = 0;

  // Cases are awaited one at a time so each case's log lines stay together.
  for (const [index, testCase] of testCases.entries()) {
    const caseNumber = index + 1;
    const title = toText(testCase.title);
    const abstract = toText(testCase.abstract);
    const humanDecision = toText(testCase.Decision);

    console.log(`[${caseNumber}/${testCases.length}] 正在筛选...`);
    console.log(`标题: ${title.substring(0, 60)}...`);
    console.log(`人类决策: ${humanDecision}`);

    try {
      const screeningResult = await llmScreeningService.dualModelScreening(
        `test-case-${caseNumber}`,
        title,
        abstract,
        picoCriteria,
        inclusionCriteria,
        exclusionCriteria,
        ['deepseek-chat', 'qwen-max'],
        'lenient' // use the lenient screening style
      );

      const normalizedHuman = humanDecision.toLowerCase().includes('include') ? 'include' : 'exclude';
      // 'pending' is reported as 'uncertain', which never matches a human
      // decision and is therefore counted as incorrect.
      const normalizedAI =
        screeningResult.finalDecision === 'pending' ? 'uncertain' : screeningResult.finalDecision;
      const isCorrect = normalizedAI === normalizedHuman;

      console.log(`AI决策: ${screeningResult.finalDecision} ${isCorrect ? '✅' : '❌'}`);
      console.log(`模型一致: ${!screeningResult.hasConflict ? '✅' : '❌'}`);
      console.log(`置信度: ${screeningResult.deepseek.confidence.toFixed(2)}\n`);

      results.push({
        caseIndex: caseNumber,
        title: title.substring(0, 100),
        humanDecision: normalizedHuman,
        aiDecision: normalizedAI,
        model1Conclusion: screeningResult.deepseek.conclusion,
        model2Conclusion: screeningResult.qwen.conclusion,
        isCorrect,
        hasConflict: screeningResult.hasConflict,
        // Confidence and reason are taken from the first model's output.
        confidence: screeningResult.deepseek.confidence,
        reason: screeningResult.deepseek.reason
      });
    } catch (error: unknown) {
      failedCount += 1;
      const message = error instanceof Error ? error.message : String(error);
      console.error(`❌ 筛选失败: ${message}\n`);
    }
  }

  // Nothing to report: every ratio below would divide by zero.
  if (results.length === 0) {
    console.log('\n⚠️ 没有成功完成的筛选结果,无法生成测试报告\n');
    return;
  }

  // Summary report.
  console.log('\n' + '='.repeat(80));
  console.log('📊 宽松模式测试报告');
  console.log('='.repeat(80) + '\n');

  if (failedCount > 0) {
    console.log(`⚠️ 筛选失败: ${failedCount}篇(未计入以下统计)\n`);
  }

  const total = results.length;
  const correct = results.filter((r) => r.isCorrect).length;
  const consistent = results.filter((r) => !r.hasConflict).length;
  const avgConfidence = results.reduce((sum, r) => sum + r.confidence, 0) / total;

  console.log(`✅ 准确率: ${formatPercent(correct, total)} (${correct}/${total})`);
  console.log(`✅ 一致率: ${formatPercent(consistent, total)} (${consistent}/${total})`);
  console.log(`✅ 平均置信度: ${avgConfidence.toFixed(2)}\n`);

  // Accuracy grouped by the human decision.
  const includedResults = results.filter((r) => r.humanDecision === 'include');
  const excludedResults = results.filter((r) => r.humanDecision === 'exclude');
  const includedTotal = includedResults.length;
  const excludedTotal = excludedResults.length;
  const includedCorrect = includedResults.filter((r) => r.isCorrect).length;
  const excludedCorrect = excludedResults.filter((r) => r.isCorrect).length;

  console.log('📋 分类准确率:');
  console.log(` 应纳入文献 (Included): ${formatPercent(includedCorrect, includedTotal)} (${includedCorrect}/${includedTotal})`);
  console.log(` 应排除文献 (Excluded): ${formatPercent(excludedCorrect, excludedTotal)} (${excludedCorrect}/${excludedTotal})\n`);

  // Per-case details; the model's reason is shown only for wrong answers.
  console.log('📝 详细案例分析:\n');
  for (const r of results) {
    const status = r.isCorrect ? '✅ 正确' : '❌ 错误';
    console.log(`[案例 ${r.caseIndex}] ${status}`);
    console.log(` 标题: ${r.title}`);
    console.log(` 人类决策: ${r.humanDecision}`);
    console.log(` AI决策: ${r.aiDecision}`);
    console.log(` 模型1: ${r.model1Conclusion}, 模型2: ${r.model2Conclusion}`);
    console.log(` 置信度: ${r.confidence.toFixed(2)}`);
    if (!r.isCorrect) {
      console.log(` AI理由: ${r.reason.substring(0, 150)}...`);
    }
    console.log('');
  }

  // Comparison against the recorded standard-mode figures.
  const baseline = STANDARD_MODE_BASELINE;
  console.log('='.repeat(80));
  console.log('🔄 与标准模式对比\n');
  console.log('| 指标 | 标准模式 | 宽松模式 | 改进 |');
  console.log('|------|----------|----------|------|');
  console.log(`| 准确率 | ${baseline.accuracy}% | ${formatPercent(correct, total)} | ${formatDelta(correct, total, baseline.accuracy)} |`);
  console.log(`| 召回率(Included) | ${baseline.recall}% | ${formatPercent(includedCorrect, includedTotal)} | ${formatDelta(includedCorrect, includedTotal, baseline.recall)} |`);
  console.log(`| 排除准确率 | ${baseline.exclusionAccuracy}% | ${formatPercent(excludedCorrect, excludedTotal)} | ${formatDelta(excludedCorrect, excludedTotal, baseline.exclusionAccuracy)} |`);
  console.log('\n' + '='.repeat(80));

  // Verdict based on overall accuracy.
  const accuracy = correct / total;
  if (accuracy >= 0.8) {
    console.log('\n🎉 宽松模式效果显著!准确率≥80%');
    console.log('💡 建议: 初筛使用宽松模式,全文复筛使用严格模式');
  } else if (accuracy >= 0.6) {
    console.log('\n⚠️ 宽松模式有改进,但仍需优化');
    console.log('💡 建议: 继续调整Prompt或考虑增加Few-shot示例');
  } else {
    console.log('\n❌ 宽松模式改进有限');
    console.log('💡 建议: 问题不在宽松/严格,而在PICOS标准的理解差异');
    console.log(' → 需要实现用户自定义边界情况功能');
  }

  console.log('\n✅ 测试完成!\n');
}

runTest().catch(console.error);