Files
AIclinicalresearch/backend/scripts/test-stroke-screening-international-models.ts
HaHafeng 3634933ece refactor(asl): ASL frontend architecture refactoring with left navigation
- feat: Create ASLLayout component with 7-module left navigation
- feat: Implement Title Screening Settings page with optimized PICOS layout
- feat: Add placeholder pages for Workbench and Results
- fix: Fix nested routing structure for React Router v6
- fix: Resolve Spin component warning in MainLayout
- fix: Add QueryClientProvider to App.tsx
- style: Optimize PICOS form layout (P+I left, C+O+S right)
- style: Align Inclusion/Exclusion criteria side-by-side
- docs: Add architecture refactoring and routing fix reports

Ref: Week 2 Frontend Development
Scope: ASL module MVP - Title Abstract Screening
2025-11-18 21:51:51 +08:00

349 lines
12 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 卒中数据测试 - 国际模型对比
*
* 目的：对比国内模型（DeepSeek+Qwen）vs 国际模型（GPT-4o+Claude）
*
* 测试假设:
* 1. 如果国际模型准确率更高 → 是模型能力问题
* 2. 如果国际模型准确率相似 → 是Prompt或理解差异问题
*/
import * as fs from 'fs';
import * as path from 'path';
import * as XLSX from 'xlsx';
import { fileURLToPath } from 'url';
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
// ES modules expose no __filename/__dirname, so both are derived from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// ========================================
// 📋 1. Load PICOS and the inclusion/exclusion criteria
// ========================================
console.log('📖 正在读取PICOS和纳排标准...\n');

const picosPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/测试案例的PICOS、纳入标准、排除标准.txt'
);
// The file is read (so a missing file aborts the script), but its text is not
// parsed: the criteria below are a hand-written, simplified transcription.
const picosContent = fs.readFileSync(picosPath, { encoding: 'utf-8' });

// Simplified PICOS definition passed to the screening service.
const picoCriteria = {
  population: '非心源性缺血性卒中患者、亚洲人群',
  intervention: '抗血小板药物/抗凝药物/溶栓药物(阿司匹林、氯吡格雷、替格瑞洛、达比加群等)',
  comparison: '安慰剂或常规治疗',
  outcome: '卒中进展、复发、残疾程度、死亡率、出血事件等',
  studyDesign: 'SR、RCT、RWE、OBS'
};

// Inclusion criteria: one numbered rule per line, wrapped in a leading and a
// trailing newline (the empty first/last entries produce them).
const inclusionCriteria = [
  '',
  '1. 研究对象为非心源性缺血性卒中患者',
  '2. 研究人群为亚洲人群（优先）',
  '3. 干预措施为抗血小板/抗凝/溶栓药物',
  '4. 对照组为安慰剂或常规治疗',
  '5. 研究时间在2020年之后',
  '6. 研究设计为SR、RCT、RWE、OBS',
  ''
].join('\n');

// Exclusion criteria, same layout as the inclusion list.
const exclusionCriteria = [
  '',
  '1. 综述、病例报告、会议摘要',
  '2. 动物实验、体外实验',
  '3. 研究人群非亚洲人群（除非有特殊价值）',
  '4. 研究时间在2020年之前',
  '5. 心源性卒中或出血性卒中',
  ''
].join('\n');

console.log('✅ PICOS标准已加载\n');
// ========================================
// 📋 2. 读取测试案例
// ========================================
console.log('📖 正在读取测试案例...\n');

const excelPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx'
);
const workbook = XLSX.read(fs.readFileSync(excelPath), { type: 'buffer' });

// Only the first sheet is used. Fail with a clear message when the workbook
// has no sheets instead of passing `undefined` further down.
const sheetName = workbook.SheetNames[0];
if (sheetName === undefined) {
  throw new Error(`No worksheet found in ${excelPath}`);
}
const worksheet = workbook.Sheets[sheetName];
const data = XLSX.utils.sheet_to_json<Record<string, unknown>>(worksheet);
console.log(`✅ 读取到 ${data.length} 条数据\n`);

// Sample size per human decision: up to 2 included + up to 3 excluded rows.
const INCLUDED_SAMPLE_SIZE = 2;
const EXCLUDED_SAMPLE_SIZE = 3;

/** Lower-cased text of a row's `Decision` cell; empty when the cell is missing. */
const decisionOf = (row: Record<string, unknown>): string =>
  String(row['Decision'] ?? '').toLowerCase();

const includedCases = data
  .filter((row) => decisionOf(row).includes('include'))
  .slice(0, INCLUDED_SAMPLE_SIZE);
const excludedCases = data
  .filter((row) => decisionOf(row).includes('exclude'))
  .slice(0, EXCLUDED_SAMPLE_SIZE);
const testCases = [...includedCases, ...excludedCases];

// Report the counts actually selected: the sheet may hold fewer matching rows
// than requested, and the total must not run into the breakdown digits.
console.log(
  `✅ 选择测试样本: ${testCases.length} (${includedCases.length} Included + ${excludedCases.length} Excluded)\n`
);
// ========================================
// 🧪 3. 定义测试模型组合
// ========================================
/** Two LLM identifiers that are run together through dual-model screening. */
interface ModelPair {
  /** Display name of the pair, printed in the test banner. */
  name: string;
  /** Identifier of the first model. */
  model1: string;
  /** Identifier of the second model. */
  model2: string;
  /** Human-readable description stored in the JSON report. */
  description: string;
}

const domesticPair: ModelPair = {
  name: '国内模型组合',
  model1: 'deepseek-chat',
  model2: 'qwen3-72b',
  description: 'DeepSeek-V3 + Qwen3-Max当前使用'
};

const internationalPair: ModelPair = {
  name: '国际模型组合',
  model1: 'gpt-4o',
  model2: 'claude-sonnet-4.5',
  description: 'GPT-4o + Claude-4.5（国际顶级模型）'
};

// Order matters: index 0 is treated as the domestic pair and index 1 as the
// international pair by the code that consumes this array.
const modelPairs: ModelPair[] = [domesticPair, internationalPair];
// ========================================
// 🧪 4. 执行测试
// ========================================
/** Outcome of screening one test case with one model pair. */
interface TestResult {
  /** 1-based position of the case in the tested sample. */
  caseIndex: number;
  /** Case title, truncated to at most 100 characters. */
  title: string;
  /** Human reference decision, normalized to 'include' or 'exclude'. */
  humanDecision: string;
  /**
   * Decision produced by the model pair: the service's final decision, with
   * 'pending' mapped to 'uncertain', or 'error' when the screening call failed.
   */
  aiDecision: string;
  /** Raw result of the first model as returned by the service; null on failure. */
  model1Result: unknown;
  /** Raw result of the second model as returned by the service; null on failure. */
  model2Result: unknown;
  /** Whether aiDecision equals humanDecision. */
  isCorrect: boolean;
  /** Whether the service reported a conflict between the two models. */
  hasConflict: boolean;
  /** Wall-clock duration of the screening call in milliseconds. */
  processingTime: number;
}
/**
 * Converts a spreadsheet cell value to text.
 * Cells may hold numbers or be absent, so the value cannot be assumed to be a string.
 */
function cellToText(value: unknown): string {
  return value === null || value === undefined ? '' : String(value);
}

/** Maps a free-form human label to 'include' (label mentions "include") or 'exclude'. */
function normalizeHumanDecision(label: string): 'include' | 'exclude' {
  return label.toLowerCase().includes('include') ? 'include' : 'exclude';
}

/**
 * Screens every case with the given model pair, one case at a time, and
 * compares the result with the human decision.
 *
 * A failing screening call does not abort the run: the case is recorded with
 * `aiDecision: 'error'` and counted as incorrect.
 *
 * @param pairName Display name of the model pair (used for logging only).
 * @param model1 Identifier of the first model.
 * @param model2 Identifier of the second model.
 * @param cases Spreadsheet rows; the `title`, `abstract` and `Decision` fields are read.
 * @returns One result per input case, in input order.
 */
async function testModelPair(
  pairName: string,
  model1: string,
  model2: string,
  cases: any[]
): Promise<TestResult[]> {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log(`🧪 测试模型组合: ${pairName}`);
  console.log(`${divider}\n`);

  const results: TestResult[] = [];

  for (let i = 0; i < cases.length; i++) {
    const testCase = cases[i];
    // Coerce cells to text up front so a numeric or missing cell cannot throw
    // outside the try block and abort the whole run.
    const title = cellToText(testCase?.['title']);
    const abstract = cellToText(testCase?.['abstract']);
    const humanDecision = cellToText(testCase?.['Decision']);
    const normalizedHuman = normalizeHumanDecision(humanDecision);

    console.log(`\n[${i + 1}/${cases.length}] 正在筛选...`);
    console.log(`标题: ${title.substring(0, 60)}...`);
    console.log(`人类决策: ${humanDecision}`);

    const startTime = Date.now();
    try {
      const screeningResult = await llmScreeningService.dualModelScreening(
        `test-case-${i + 1}`, // literatureId
        title,
        abstract,
        picoCriteria,
        inclusionCriteria,
        exclusionCriteria,
        [model1, model2], // models are passed as an array
        'standard' // screening style
      );
      const processingTime = Date.now() - startTime;

      // 'pending' from the service is reported as 'uncertain'; it never matches
      // a human decision and is therefore counted as incorrect.
      const normalizedAI =
        screeningResult.finalDecision === 'pending' ? 'uncertain' : screeningResult.finalDecision;
      const isCorrect = normalizedAI === normalizedHuman;

      console.log(`AI决策: ${screeningResult.finalDecision} ${isCorrect ? '✅' : '❌'}`);
      console.log(`模型一致: ${!screeningResult.hasConflict ? '✅' : '❌'}`);
      console.log(`处理时间: ${(processingTime / 1000).toFixed(2)}`);

      results.push({
        caseIndex: i + 1,
        title: title.substring(0, 100),
        humanDecision: normalizedHuman,
        aiDecision: normalizedAI,
        model1Result: screeningResult.model1Result,
        model2Result: screeningResult.model2Result,
        isCorrect,
        hasConflict: screeningResult.hasConflict,
        processingTime
      });
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const reason = error instanceof Error ? error.message : String(error);
      console.error(`❌ 筛选失败: ${reason}`);
      results.push({
        caseIndex: i + 1,
        title: title.substring(0, 100),
        humanDecision: normalizedHuman,
        aiDecision: 'error',
        model1Result: null,
        model2Result: null,
        isCorrect: false,
        hasConflict: false,
        processingTime: Date.now() - startTime
      });
    }
  }

  return results;
}
// ========================================
// 📊 5. 生成对比报告
// ========================================
/** Aggregate figures for one model pair; rates and times are pre-formatted strings. */
interface PairMetrics {
  /** Share of correct decisions in percent, one decimal place. */
  accuracy: string;
  /** Share of cases without a model conflict in percent, one decimal place. */
  consistency: string;
  /** Mean processing time in seconds, two decimal places. */
  avgTime: string;
  /** Number of correct decisions. */
  correct: number;
  /** Number of evaluated cases. */
  total: number;
}

/** Computes accuracy, consistency and mean time; an empty result set yields zeros, not NaN. */
function calculateMetrics(results: TestResult[]): PairMetrics {
  const total = results.length;
  const correct = results.filter((r) => r.isCorrect).length;
  const consistent = results.filter((r) => !r.hasConflict).length;
  const totalTime = results.reduce((sum, r) => sum + r.processingTime, 0);
  // Avoid 0/0 when no case was evaluated.
  const perCase = (amount: number): number => (total === 0 ? 0 : amount / total);

  return {
    accuracy: (perCase(correct) * 100).toFixed(1),
    consistency: (perCase(consistent) * 100).toFixed(1),
    avgTime: (perCase(totalTime) / 1000).toFixed(2),
    correct,
    total
  };
}

/**
 * Prints a domestic-vs-international comparison (metrics table, per-case
 * verdicts, conclusion) and writes the full data as JSON to
 * `../docs/国内外模型对比测试报告.json` relative to this script.
 *
 * The conclusion uses a 10 percentage-point threshold on the accuracy
 * difference (international minus domestic).
 *
 * @param domesticResults Results of the domestic model pair.
 * @param internationalResults Results of the international model pair, case-aligned with the domestic ones.
 * @throws If the report file cannot be written.
 */
function generateComparisonReport(
  domesticResults: TestResult[],
  internationalResults: TestResult[]
): void {
  const wideDivider = '='.repeat(80);
  const mark = (ok: boolean): string => (ok ? '✅' : '❌');
  // Differences are taken between the displayed (rounded) figures so the table stays self-consistent.
  const delta = (international: string, domestic: string, digits: number): string =>
    (parseFloat(international) - parseFloat(domestic)).toFixed(digits);

  console.log(`\n${wideDivider}`);
  console.log(`📊 国内 vs 国际模型对比报告`);
  console.log(`${wideDivider}\n`);

  const domesticMetrics = calculateMetrics(domesticResults);
  const internationalMetrics = calculateMetrics(internationalResults);

  // Metrics table
  console.log('| 指标 | 国内模型 | 国际模型 | 差异 |');
  console.log('|------|----------|----------|------|');
  console.log(`| 准确率 | ${domesticMetrics.accuracy}% (${domesticMetrics.correct}/${domesticMetrics.total}) | ${internationalMetrics.accuracy}% (${internationalMetrics.correct}/${internationalMetrics.total}) | ${delta(internationalMetrics.accuracy, domesticMetrics.accuracy, 1)}% |`);
  console.log(`| 一致率 | ${domesticMetrics.consistency}% | ${internationalMetrics.consistency}% | ${delta(internationalMetrics.consistency, domesticMetrics.consistency, 1)}% |`);
  console.log(`| 平均耗时 | ${domesticMetrics.avgTime}秒 | ${internationalMetrics.avgTime}秒 | ${delta(internationalMetrics.avgTime, domesticMetrics.avgTime, 2)}秒 |`);
  console.log('\n');

  // Per-case comparison. Either side may be shorter than the other, so a
  // missing entry is reported as N/A instead of dereferencing undefined.
  console.log('📋 逐案例对比:\n');
  const caseCount = Math.max(domesticResults.length, internationalResults.length);
  for (let i = 0; i < caseCount; i++) {
    const domestic: TestResult | undefined = domesticResults[i];
    const international: TestResult | undefined = internationalResults[i];
    const reference = domestic ?? international;
    if (reference === undefined) {
      continue;
    }
    console.log(`[案例 ${i + 1}] ${reference.title}`);
    console.log(` 人类: ${reference.humanDecision}`);
    console.log(` 国内模型: ${domestic ? `${domestic.aiDecision} ${mark(domestic.isCorrect)}` : 'N/A'}`);
    console.log(` 国际模型: ${international ? `${international.aiDecision} ${mark(international.isCorrect)}` : 'N/A'}`);
    if (domestic?.aiDecision !== international?.aiDecision) {
      console.log(` ⚠️ 两组模型判断不一致！`);
    }
    console.log('');
  }

  // Conclusion
  console.log('\n' + wideDivider);
  console.log('🎯 结论分析\n');
  const accuracyDiff = parseFloat(internationalMetrics.accuracy) - parseFloat(domesticMetrics.accuracy);

  let analysis: string;
  if (Math.abs(accuracyDiff) <= 10) {
    analysis = 'Prompt问题';
    console.log('✅ 结论: 国内外模型准确率相近差异≤10%');
    console.log(' → 问题不在模型能力，而在于:');
    console.log(' 1. Prompt设计可能过于严格');
    console.log(' 2. AI vs 人类对"匹配"的理解差异');
    console.log(' 3. 纳排标准本身存在歧义');
    console.log('\n💡 建议: 优化Prompt策略增加宽松/标准/严格三种模式');
  } else if (accuracyDiff > 10) {
    analysis = '国际模型更优';
    console.log('✅ 结论: 国际模型显著优于国内模型（差异>10%）');
    console.log(' → 问题在于模型能力差异');
    console.log(' → 国际模型对医学文献的理解更准确');
    console.log('\n💡 建议: 优先使用GPT-4o或Claude-4.5进行筛选');
  } else {
    analysis = '国内模型更优';
    console.log('✅ 结论: 国内模型优于国际模型（差异>10%）');
    console.log(' → 可能是国内模型对中文医学术语理解更好');
    console.log(' → 或者国内模型更符合中国专家的筛选习惯');
    console.log('\n💡 建议: 继续使用国内模型组合');
  }
  console.log(wideDivider + '\n');

  // Persist the detailed report
  const report = {
    testDate: new Date().toISOString(),
    testCases: testCases.length,
    domesticModels: modelPairs[0],
    internationalModels: modelPairs[1],
    domesticMetrics,
    internationalMetrics,
    domesticResults,
    internationalResults,
    conclusion: {
      accuracyDiff,
      analysis
    }
  };
  const reportPath = path.join(__dirname, '../docs/国内外模型对比测试报告.json');
  // Create the target directory first so a missing folder does not discard a finished run.
  fs.mkdirSync(path.dirname(reportPath), { recursive: true });
  fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), 'utf-8');
  console.log(`📄 详细报告已保存: ${reportPath}\n`);
}
// ========================================
// 🚀 6. 执行主流程
// ========================================
/**
 * Runs the whole comparison: screens the sample with the domestic pair, waits
 * two seconds, screens it again with the international pair, then prints and
 * saves the comparison report.
 */
async function main() {
  const [domestic, international] = modelPairs;
  // Rough estimate: 15 seconds per case, for each of the two model pairs.
  const estimatedSeconds = testCases.length * 2 * 15;
  const pause = (ms: number) => new Promise(resolve => setTimeout(resolve, ms));

  console.log('\n🚀 开始国内外模型对比测试\n');
  console.log(`测试样本: ${testCases.length}`);
  console.log(`测试组合: 2组`);
  console.log(`预计耗时: ${estimatedSeconds}秒（约${Math.ceil(estimatedSeconds / 60)}分钟）\n`);

  // Domestic pair first
  const domesticResults = await testModelPair(
    domestic.name,
    domestic.model1,
    domestic.model2,
    testCases
  );

  // Short pause between the two runs to stay clear of API rate limits
  console.log('\n⏳ 等待2秒后测试国际模型...\n');
  await pause(2000);

  // International pair second
  const internationalResults = await testModelPair(
    international.name,
    international.model1,
    international.model2,
    testCases
  );

  // Print and persist the comparison
  generateComparisonReport(domesticResults, internationalResults);
  console.log('✅ 测试完成！\n');
}
// Entry point. A failure is logged and reflected in the exit status so that
// shells and CI do not treat a crashed run as successful.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});