/**
 * Stroke dataset test - international model comparison.
 *
 * Goal: compare the domestic model pair (DeepSeek + Qwen) against the
 * international model pair (GPT-4o + Claude).
 *
 * Hypotheses under test:
 * 1. If the international models are more accurate -> the gap is a model-capability problem.
 * 2. If the international models are about as accurate -> the gap comes from the prompt
 *    or from differences in interpretation.
 */

import * as fs from 'fs';
import * as path from 'path';
import * as XLSX from 'xlsx';
import { fileURLToPath } from 'url';
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';

// ESM has no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// ========================================
// 1. Load PICOS and the inclusion/exclusion criteria
// ========================================

console.log('📖 正在读取PICOS和纳排标准...\n');

const picosPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/测试案例的PICOS、纳入标准、排除标准.txt'
);
// The file is read (so a missing file fails fast) but its content is not parsed:
// the criteria below are a hard-coded, simplified version.
const picosContent = fs.readFileSync(picosPath, 'utf-8');

// PICOS criteria (simplified version).
const picoCriteria = {
  population: '非心源性缺血性卒中患者、亚洲人群',
  intervention: '抗血小板药物/抗凝药物/溶栓药物(阿司匹林、氯吡格雷、替格瑞洛、达比加群等)',
  comparison: '安慰剂或常规治疗',
  outcome: '卒中进展、复发、残疾程度、死亡率、出血事件等',
  studyDesign: 'SR、RCT、RWE、OBS'
};

const inclusionCriteria = ` 1. 研究对象为非心源性缺血性卒中患者 2. 研究人群为亚洲人群(优先) 3. 干预措施为抗血小板/抗凝/溶栓药物 4. 对照组为安慰剂或常规治疗 5. 研究时间在2020年之后 6. 研究设计为SR、RCT、RWE、OBS `;

const exclusionCriteria = ` 1. 综述、病例报告、会议摘要 2. 动物实验、体外实验 3. 研究人群非亚洲人群(除非有特殊价值) 4. 研究时间在2020年之前 5. 心源性卒中或出血性卒中 `;

console.log('✅ PICOS标准已加载\n');

// ========================================
// 2. Load the test cases
// ========================================

console.log('📖 正在读取测试案例...\n');

/** One spreadsheet row keyed by column header. */
type SheetRow = Record<string, unknown>;

const excelPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx'
);

const workbook = XLSX.read(fs.readFileSync(excelPath), { type: 'buffer' });
const sheetName = workbook.SheetNames[0];
const worksheet = sheetName === undefined ? undefined : workbook.Sheets[sheetName];
if (!worksheet) {
  // Fail with a clear message instead of an obscure error inside sheet_to_json.
  throw new Error(`No worksheet found in workbook: ${excelPath}`);
}
const data = XLSX.utils.sheet_to_json<SheetRow>(worksheet);

console.log(`✅ 读取到 ${data.length} 条数据\n`);

/**
 * Returns at most `limit` rows whose `Decision` column contains `keyword`
 * (case-insensitive). Rows without a `Decision` value never match.
 */
function selectByDecision(rows: SheetRow[], keyword: string, limit: number): SheetRow[] {
  return rows
    .filter(row => String(row['Decision'] ?? '').toLowerCase().includes(keyword))
    .slice(0, limit);
}

// Test sample: up to 2 included + up to 3 excluded cases.
const includedCases = selectByDecision(data, 'include', 2);
const excludedCases = selectByDecision(data, 'exclude', 3);
const testCases = [...includedCases, ...excludedCases];

// Report the counts actually found; the sheet may hold fewer matching rows than requested.
console.log(
  `✅ 选择测试样本: ${testCases.length}篇(${includedCases.length} Included + ${excludedCases.length} Excluded)\n`
);

// ========================================
// 3. Model pairs under test
// ========================================

// NOTE(review): the first pair's model2 id is 'qwen3-72b' while its description
// says "Qwen3-Max" — confirm which one is intended.
const modelPairs = [
  {
    name: '国内模型组合',
    model1: 'deepseek-chat',
    model2: 'qwen3-72b',
    description: 'DeepSeek-V3 + Qwen3-Max(当前使用)'
  },
  {
    name: '国际模型组合',
    model1: 'gpt-4o',
    model2: 'claude-sonnet-4.5',
    description: 'GPT-4o + Claude-4.5(国际顶级模型)'
  }
];
// ========================================
// 4. Run the tests
// ========================================

/** Outcome of screening one test case with one model pair. */
interface TestResult {
  /** 1-based position of the case in the test sample. */
  caseIndex: number;
  /** Case title, truncated to 100 characters. */
  title: string;
  /** Human decision normalized to 'include' or 'exclude'. */
  humanDecision: string;
  /** Service decision; 'uncertain' replaces 'pending', 'error' marks a failed call. */
  aiDecision: string;
  model1Result: unknown;
  model2Result: unknown;
  isCorrect: boolean;
  hasConflict: boolean;
  /** Wall-clock time for the case, in milliseconds. */
  processingTime: number;
}

/** Aggregated metrics for one model pair; percentages and seconds are pre-formatted strings. */
interface PairMetrics {
  accuracy: string;
  consistency: string;
  avgTime: string;
  correct: number;
  total: number;
}

/** Maps a raw human decision to 'include' when it contains "include" (case-insensitive), else 'exclude'. */
function normalizeHumanDecision(raw: unknown): 'include' | 'exclude' {
  return String(raw ?? '').toLowerCase().includes('include') ? 'include' : 'exclude';
}

/**
 * Screens every case with the given model pair and records one result per case.
 *
 * A failed screening call does not abort the run: it is logged and recorded
 * with `aiDecision: 'error'` and `isCorrect: false`.
 *
 * @param pairName Display name of the model pair (used for logging only).
 * @param model1 Identifier of the first model.
 * @param model2 Identifier of the second model.
 * @param cases Rows carrying `title`, `abstract` and `Decision` fields.
 * @returns One TestResult per case, in input order.
 */
async function testModelPair(
  pairName: string,
  model1: string,
  model2: string,
  cases: any[]
): Promise<TestResult[]> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🧪 测试模型组合: ${pairName}`);
  console.log(`${'='.repeat(60)}\n`);

  const results: TestResult[] = [];

  for (let i = 0; i < cases.length; i++) {
    const testCase = cases[i];
    // Coerce to string so a numeric/boolean cell cannot break the string methods below.
    const title = String(testCase['title'] || '');
    const abstract = String(testCase['abstract'] || '');
    const humanDecision = String(testCase['Decision'] || '');
    const normalizedHuman = normalizeHumanDecision(humanDecision);

    console.log(`\n[${i + 1}/${cases.length}] 正在筛选...`);
    console.log(`标题: ${title.substring(0, 60)}...`);
    console.log(`人类决策: ${humanDecision}`);

    const startTime = Date.now();

    try {
      const screeningResult = await llmScreeningService.dualModelScreening(
        `test-case-${i + 1}`, // literatureId
        title,
        abstract,
        picoCriteria,
        inclusionCriteria,
        exclusionCriteria,
        [model1, model2], // models must be passed as an array
        'standard' // style
      );

      const processingTime = Date.now() - startTime;

      // A 'pending' final decision is reported as 'uncertain', which never matches the human label.
      const normalizedAI =
        screeningResult.finalDecision === 'pending' ? 'uncertain' : screeningResult.finalDecision;
      const isCorrect = normalizedAI === normalizedHuman;

      console.log(`AI决策: ${screeningResult.finalDecision} ${isCorrect ? '✅' : '❌'}`);
      console.log(`模型一致: ${!screeningResult.hasConflict ? '✅' : '❌'}`);
      console.log(`处理时间: ${(processingTime / 1000).toFixed(2)}秒`);

      results.push({
        caseIndex: i + 1,
        title: title.substring(0, 100),
        humanDecision: normalizedHuman,
        aiDecision: normalizedAI,
        model1Result: screeningResult.model1Result,
        model2Result: screeningResult.model2Result,
        isCorrect,
        hasConflict: screeningResult.hasConflict,
        processingTime
      });
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.error(`❌ 筛选失败: ${message}`);
      // NOTE(review): failed calls are stored with hasConflict=false, so they count
      // as "consistent" in the consistency rate — confirm this is intended.
      results.push({
        caseIndex: i + 1,
        title: title.substring(0, 100),
        humanDecision: normalizedHuman,
        aiDecision: 'error',
        model1Result: null,
        model2Result: null,
        isCorrect: false,
        hasConflict: false,
        processingTime: Date.now() - startTime
      });
    }
  }

  return results;
}

// ========================================
// 5. Build the comparison report
// ========================================

/**
 * Computes accuracy (%), model-agreement rate (%) and mean processing time (s).
 * An empty result list yields zeros rather than NaN.
 */
function calculateMetrics(results: TestResult[]): PairMetrics {
  const total = results.length;
  if (total === 0) {
    return { accuracy: '0.0', consistency: '0.0', avgTime: '0.00', correct: 0, total: 0 };
  }

  const correct = results.filter(r => r.isCorrect).length;
  const consistent = results.filter(r => !r.hasConflict).length;
  const avgTime = results.reduce((sum, r) => sum + r.processingTime, 0) / total;

  return {
    accuracy: ((correct / total) * 100).toFixed(1),
    consistency: ((consistent / total) * 100).toFixed(1),
    avgTime: (avgTime / 1000).toFixed(2),
    correct,
    total
  };
}

/**
 * Prints a side-by-side comparison of both model pairs, prints a conclusion based on
 * the accuracy difference (10 percentage-point threshold), and writes the full report
 * as JSON next to the project docs.
 *
 * @param domesticResults Results of the domestic model pair.
 * @param internationalResults Results of the international model pair, in the same case order.
 */
function generateComparisonReport(
  domesticResults: TestResult[],
  internationalResults: TestResult[]
) {
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📊 国内 vs 国际模型对比报告`);
  console.log(`${'='.repeat(80)}\n`);

  const domesticMetrics = calculateMetrics(domesticResults);
  const internationalMetrics = calculateMetrics(internationalResults);

  // Differences are international minus domestic, computed from the displayed (rounded) values.
  const accuracyDiff =
    parseFloat(internationalMetrics.accuracy) - parseFloat(domesticMetrics.accuracy);
  const consistencyDiff =
    parseFloat(internationalMetrics.consistency) - parseFloat(domesticMetrics.consistency);
  const avgTimeDiff =
    parseFloat(internationalMetrics.avgTime) - parseFloat(domesticMetrics.avgTime);

  // Summary table
  console.log('| 指标 | 国内模型 | 国际模型 | 差异 |');
  console.log('|------|----------|----------|------|');
  console.log(`| 准确率 | ${domesticMetrics.accuracy}% (${domesticMetrics.correct}/${domesticMetrics.total}) | ${internationalMetrics.accuracy}% (${internationalMetrics.correct}/${internationalMetrics.total}) | ${accuracyDiff.toFixed(1)}% |`);
  console.log(`| 一致率 | ${domesticMetrics.consistency}% | ${internationalMetrics.consistency}% | ${consistencyDiff.toFixed(1)}% |`);
  console.log(`| 平均耗时 | ${domesticMetrics.avgTime}秒 | ${internationalMetrics.avgTime}秒 | ${avgTimeDiff.toFixed(2)}秒 |`);
  console.log('\n');

  // Case-by-case comparison; only indices present in both lists are compared.
  console.log('📋 逐案例对比:\n');
  const comparable = Math.min(domesticResults.length, internationalResults.length);
  for (let i = 0; i < comparable; i++) {
    const domestic = domesticResults[i];
    const international = internationalResults[i];

    console.log(`[案例 ${i + 1}] ${domestic.title}`);
    console.log(` 人类: ${domestic.humanDecision}`);
    console.log(` 国内模型: ${domestic.aiDecision} ${domestic.isCorrect ? '✅' : '❌'}`);
    console.log(` 国际模型: ${international.aiDecision} ${international.isCorrect ? '✅' : '❌'}`);

    if (domestic.aiDecision !== international.aiDecision) {
      console.log(` ⚠️ 两组模型判断不一致!`);
    }
    console.log('');
  }

  // Conclusion
  console.log('\n' + '='.repeat(80));
  console.log('🎯 结论分析\n');

  let analysis: string;
  if (Math.abs(accuracyDiff) <= 10) {
    analysis = 'Prompt问题';
    console.log('✅ 结论: 国内外模型准确率相近(差异≤10%)');
    console.log(' → 问题不在模型能力,而在于:');
    console.log(' 1. Prompt设计(可能过于严格)');
    console.log(' 2. AI vs 人类对"匹配"的理解差异');
    console.log(' 3. 纳排标准本身存在歧义');
    console.log('\n💡 建议: 优化Prompt策略,增加宽松/标准/严格三种模式');
  } else if (accuracyDiff > 10) {
    analysis = '国际模型更优';
    console.log('✅ 结论: 国际模型显著优于国内模型(差异>10%)');
    console.log(' → 问题在于模型能力差异');
    console.log(' → 国际模型对医学文献的理解更准确');
    console.log('\n💡 建议: 优先使用GPT-4o或Claude-4.5进行筛选');
  } else {
    analysis = '国内模型更优';
    console.log('✅ 结论: 国内模型优于国际模型(差异>10%)');
    console.log(' → 可能是国内模型对中文医学术语理解更好');
    console.log(' → 或者国内模型更符合中国专家的筛选习惯');
    console.log('\n💡 建议: 继续使用国内模型组合');
  }

  console.log('='.repeat(80) + '\n');

  // Persist the detailed report.
  const report = {
    testDate: new Date().toISOString(),
    testCases: testCases.length,
    domesticModels: modelPairs[0],
    internationalModels: modelPairs[1],
    domesticMetrics,
    internationalMetrics,
    domesticResults,
    internationalResults,
    conclusion: {
      accuracyDiff,
      analysis
    }
  };

  const reportPath = path.join(__dirname, '../docs/国内外模型对比测试报告.json');
  // Create the target directory first so a fresh checkout does not fail with ENOENT.
  fs.mkdirSync(path.dirname(reportPath), { recursive: true });
  fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), 'utf-8');
  console.log(`📄 详细报告已保存: ${reportPath}\n`);
}

// ========================================
// 6. Main flow
// ========================================

/** Runs both model pairs sequentially over the test sample, then prints and saves the comparison. */
async function main(): Promise<void> {
  console.log('\n🚀 开始国内外模型对比测试\n');
  console.log(`测试样本: ${testCases.length}篇`);
  console.log(`测试组合: 2组`);
  // Rough estimate: 15 seconds per case per model pair.
  console.log(`预计耗时: ${testCases.length * 2 * 15}秒(约${Math.ceil(testCases.length * 2 * 15 / 60)}分钟)\n`);

  // Domestic models
  const domesticResults = await testModelPair(
    modelPairs[0].name,
    modelPairs[0].model1,
    modelPairs[0].model2,
    testCases
  );

  // Pause for 2 seconds to avoid API rate limiting.
  console.log('\n⏳ 等待2秒后测试国际模型...\n');
  await new Promise(resolve => setTimeout(resolve, 2000));

  // International models
  const internationalResults = await testModelPair(
    modelPairs[1].name,
    modelPairs[1].model1,
    modelPairs[1].model2,
    testCases
  );

  generateComparisonReport(domesticResults, internationalResults);

  console.log('✅ 测试完成!\n');
}

main().catch((error: unknown) => {
  console.error(error);
  // Signal failure to the calling shell/CI instead of exiting with code 0.
  process.exitCode = 1;
});