- feat: Create ASLLayout component with 7-module left navigation
- feat: Implement Title Screening Settings page with optimized PICOS layout
- feat: Add placeholder pages for Workbench and Results
- fix: Fix nested routing structure for React Router v6
- fix: Resolve Spin component warning in MainLayout
- fix: Add QueryClientProvider to App.tsx
- style: Optimize PICOS form layout (P+I left, C+O+S right)
- style: Align Inclusion/Exclusion criteria side-by-side
- docs: Add architecture refactoring and routing fix reports

Ref: Week 2 Frontend Development
Scope: ASL module MVP - Title Abstract Screening
349 lines
12 KiB
TypeScript
349 lines
12 KiB
TypeScript
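/**
 * Stroke dataset test - domestic vs. international model comparison
 *
 * Purpose: compare the domestic models (DeepSeek + Qwen) with the
 * international models (GPT-4o + Claude).
 *
 * Hypotheses under test:
 * 1. If the international models are more accurate → the gap is a model-capability problem.
 * 2. If the international models reach similar accuracy → the gap comes from the prompt
 *    or from differences in interpretation.
 */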
|
||
|
||
import * as fs from 'fs';
|
||
import * as path from 'path';
|
||
import * as XLSX from 'xlsx';
|
||
import { fileURLToPath } from 'url';
|
||
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
|
||
|
||
// ES modules do not provide __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// ========================================
// 📋 1. Load PICOS and inclusion/exclusion criteria
// ========================================

console.log('📖 正在读取PICOS和纳排标准...\n');

const picosPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/测试案例的PICOS、纳入标准、排除标准.txt'
);

// NOTE(review): the file content is read but not referenced anywhere in the
// visible script; the criteria below are hard-coded. The read still makes the
// script fail immediately when the reference document is missing.
const picosContent = fs.readFileSync(picosPath, 'utf-8');

// PICOS criteria (simplified: written out by hand instead of parsed from the file)
const picoCriteria = {
  population: '非心源性缺血性卒中患者、亚洲人群',
  intervention: '抗血小板药物/抗凝药物/溶栓药物(阿司匹林、氯吡格雷、替格瑞洛、达比加群等)',
  comparison: '安慰剂或常规治疗',
  outcome: '卒中进展、复发、残疾程度、死亡率、出血事件等',
  studyDesign: 'SR、RCT、RWE、OBS'
};

// Inclusion criteria, passed verbatim to the screening service.
const inclusionCriteria = `
1. 研究对象为非心源性缺血性卒中患者
2. 研究人群为亚洲人群(优先)
3. 干预措施为抗血小板/抗凝/溶栓药物
4. 对照组为安慰剂或常规治疗
5. 研究时间在2020年之后
6. 研究设计为SR、RCT、RWE、OBS
`;

// Exclusion criteria, passed verbatim to the screening service.
const exclusionCriteria = `
1. 综述、病例报告、会议摘要
2. 动物实验、体外实验
3. 研究人群非亚洲人群(除非有特殊价值)
4. 研究时间在2020年之前
5. 心源性卒中或出血性卒中
`;

console.log('✅ PICOS标准已加载\n');
|
||
|
||
// ========================================
// 📋 2. Load test cases
// ========================================

console.log('📖 正在读取测试案例...\n');

const excelPath = path.join(
  __dirname,
  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx'
);

// Only the first worksheet is read; each row becomes an object keyed by column header.
const workbook = XLSX.read(fs.readFileSync(excelPath), { type: 'buffer' });
const sheetName = workbook.SheetNames[0];
const worksheet = workbook.Sheets[sheetName];
const data = XLSX.utils.sheet_to_json<Record<string, unknown>>(worksheet);

console.log(`✅ 读取到 ${data.length} 条数据\n`);

// Target sample: 2 included + 3 excluded cases.
const INCLUDED_SAMPLE_SIZE = 2;
const EXCLUDED_SAMPLE_SIZE = 3;

/**
 * Returns the first `limit` rows whose `Decision` cell contains `keyword`
 * (case-insensitive). Rows with an empty `Decision` cell never match.
 */
const pickByDecision = (keyword: string, limit: number): Record<string, unknown>[] =>
  data
    .filter(row => String(row['Decision'] ?? '').toLowerCase().includes(keyword))
    .slice(0, limit);

const includedCases = pickByDecision('include', INCLUDED_SAMPLE_SIZE);
const excludedCases = pickByDecision('exclude', EXCLUDED_SAMPLE_SIZE);

const testCases = [...includedCases, ...excludedCases];

// Report the counts actually found; the sheet may hold fewer matching rows than requested.
console.log(
  `✅ 选择测试样本: ${testCases.length}篇(${includedCases.length} Included + ${excludedCases.length} Excluded)\n`
);

if (
  includedCases.length < INCLUDED_SAMPLE_SIZE ||
  excludedCases.length < EXCLUDED_SAMPLE_SIZE
) {
  console.warn(
    `⚠️ 样本不足: 期望 ${INCLUDED_SAMPLE_SIZE} Included + ${EXCLUDED_SAMPLE_SIZE} Excluded\n`
  );
}
|
||
|
||
// ========================================
// 🧪 3. Model pairs under test
// ========================================

// Index 0 is the domestic pair and index 1 the international pair; the rest of
// the script addresses them by position.
// NOTE(review): the first description says "Qwen3-Max" while model2 is
// 'qwen3-72b' — confirm which one is intended.
const modelPairs = [
  {
    name: '国内模型组合',
    model1: 'deepseek-chat',
    model2: 'qwen3-72b',
    description: 'DeepSeek-V3 + Qwen3-Max(当前使用)'
  },
  {
    name: '国际模型组合',
    model1: 'gpt-4o',
    model2: 'claude-sonnet-4.5',
    description: 'GPT-4o + Claude-4.5(国际顶级模型)'
  }
];
|
||
|
||
// ========================================
|
||
// 🧪 4. 执行测试
|
||
// ========================================
|
||
|
||
/**
 * Outcome of screening a single test case with one model pair.
 */
interface TestResult {
  /** 1-based position of the case within the tested sample. */
  caseIndex: number;
  /** Literature title (the producer truncates it to 100 characters). */
  title: string;
  /** Human reference decision, normalized to 'include' or 'exclude'. */
  humanDecision: string;
  /** AI decision; 'uncertain' stands in for 'pending', 'error' marks a failed call. */
  aiDecision: string;
  /** Raw result of the first model, or null when the call failed. Only serialized, never inspected. */
  model1Result: unknown;
  /** Raw result of the second model, or null when the call failed. Only serialized, never inspected. */
  model2Result: unknown;
  /** Whether the AI decision equals the normalized human decision. */
  isCorrect: boolean;
  /** Conflict flag reported by the screening service; false for failed calls. */
  hasConflict: boolean;
  /** Wall-clock duration of the screening call in milliseconds. */
  processingTime: number;
}
|
||
|
||
/**
 * Screens every case with the given model pair and records how the AI decision
 * compares with the human decision.
 *
 * Cases are processed one after another. A failed screening call does not abort
 * the run: it is logged and recorded with `aiDecision: 'error'`.
 *
 * @param pairName - Display name printed in the console banner.
 * @param model1 - Identifier of the first model handed to the screening service.
 * @param model2 - Identifier of the second model handed to the screening service.
 * @param cases - Spreadsheet rows; the `title`, `abstract` and `Decision` columns are read.
 * @returns One result per input case, in input order.
 */
async function testModelPair(
  pairName: string,
  model1: string,
  model2: string,
  // Rows come straight from the spreadsheet parser, so their shape is not statically known.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  cases: any[]
): Promise<TestResult[]> {
  console.log(`\n${'='.repeat(60)}`);
  console.log(`🧪 测试模型组合: ${pairName}`);
  console.log(`${'='.repeat(60)}\n`);

  const results: TestResult[] = [];

  for (const [index, testCase] of cases.entries()) {
    const caseNumber = index + 1;

    // Spreadsheet cells may hold numbers or be empty; coerce to strings so the
    // string methods below cannot throw and abort the whole run.
    const title = String(testCase['title'] || '');
    const abstract = String(testCase['abstract'] || '');
    const humanDecision = String(testCase['Decision'] || '');

    // Any label that does not mention "include" is treated as an exclusion.
    const normalizedHuman = humanDecision.toLowerCase().includes('include') ? 'include' : 'exclude';
    const shortTitle = title.substring(0, 100);

    console.log(`\n[${caseNumber}/${cases.length}] 正在筛选...`);
    console.log(`标题: ${title.substring(0, 60)}...`);
    console.log(`人类决策: ${humanDecision}`);

    const startTime = Date.now();

    try {
      const screeningResult = await llmScreeningService.dualModelScreening(
        `test-case-${caseNumber}`, // literatureId
        title,
        abstract,
        picoCriteria,
        inclusionCriteria,
        exclusionCriteria,
        [model1, model2], // models are passed as an array
        'standard' // style
      );

      const processingTime = Date.now() - startTime;

      // 'pending' is reported as 'uncertain'; it never equals a human decision.
      const normalizedAI =
        screeningResult.finalDecision === 'pending' ? 'uncertain' : screeningResult.finalDecision;
      const isCorrect = normalizedAI === normalizedHuman;

      console.log(`AI决策: ${screeningResult.finalDecision} ${isCorrect ? '✅' : '❌'}`);
      console.log(`模型一致: ${!screeningResult.hasConflict ? '✅' : '❌'}`);
      console.log(`处理时间: ${(processingTime / 1000).toFixed(2)}秒`);

      results.push({
        caseIndex: caseNumber,
        title: shortTitle,
        humanDecision: normalizedHuman,
        aiDecision: normalizedAI,
        model1Result: screeningResult.model1Result,
        model2Result: screeningResult.model2Result,
        isCorrect,
        hasConflict: screeningResult.hasConflict,
        processingTime
      });
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances are guaranteed to carry a message.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`❌ 筛选失败: ${message}`);

      results.push({
        caseIndex: caseNumber,
        title: shortTitle,
        humanDecision: normalizedHuman,
        aiDecision: 'error',
        model1Result: null,
        model2Result: null,
        isCorrect: false,
        hasConflict: false,
        processingTime: Date.now() - startTime
      });
    }
  }

  return results;
}
|
||
|
||
// ========================================
|
||
// 📊 5. 生成对比报告
|
||
// ========================================
|
||
|
||
/**
 * Prints a domestic-vs-international comparison (summary table, case-by-case
 * listing, conclusion) to the console and saves the full data as a JSON report
 * under `../docs` relative to this script.
 *
 * @param domesticResults - Results of the domestic model pair.
 * @param internationalResults - Results of the international model pair, in the same case order.
 * @throws If the report file cannot be written.
 */
function generateComparisonReport(
  domesticResults: TestResult[],
  internationalResults: TestResult[]
): void {
  // Accuracy gap (percentage points) up to which both pairs count as equivalent.
  const SIMILARITY_THRESHOLD = 10;

  console.log(`\n${'='.repeat(80)}`);
  console.log(`📊 国内 vs 国际模型对比报告`);
  console.log(`${'='.repeat(80)}\n`);

  /** Aggregates accuracy, model agreement and average latency for one result set. */
  function calculateMetrics(results: TestResult[]) {
    const total = results.length;
    const correct = results.filter(r => r.isCorrect).length;
    // Failed calls are recorded with hasConflict=false but prove no agreement,
    // so they must not inflate the consistency rate.
    const consistent = results.filter(r => !r.hasConflict && r.aiDecision !== 'error').length;
    const totalTime = results.reduce((sum, r) => sum + r.processingTime, 0);
    // An empty result set would otherwise yield 0/0 = NaN in every metric.
    const perCase = (value: number): number => (total === 0 ? 0 : value / total);

    return {
      accuracy: (perCase(correct) * 100).toFixed(1),
      consistency: (perCase(consistent) * 100).toFixed(1),
      avgTime: (perCase(totalTime) / 1000).toFixed(2),
      correct,
      total
    };
  }

  const domesticMetrics = calculateMetrics(domesticResults);
  const internationalMetrics = calculateMetrics(internationalResults);

  // Differences are international minus domestic, based on the displayed (rounded) values.
  const accuracyDiff =
    parseFloat(internationalMetrics.accuracy) - parseFloat(domesticMetrics.accuracy);
  const consistencyDiff =
    parseFloat(internationalMetrics.consistency) - parseFloat(domesticMetrics.consistency);
  const avgTimeDiff =
    parseFloat(internationalMetrics.avgTime) - parseFloat(domesticMetrics.avgTime);

  // Summary table
  console.log('| 指标 | 国内模型 | 国际模型 | 差异 |');
  console.log('|------|----------|----------|------|');
  console.log(`| 准确率 | ${domesticMetrics.accuracy}% (${domesticMetrics.correct}/${domesticMetrics.total}) | ${internationalMetrics.accuracy}% (${internationalMetrics.correct}/${internationalMetrics.total}) | ${accuracyDiff.toFixed(1)}% |`);
  console.log(`| 一致率 | ${domesticMetrics.consistency}% | ${internationalMetrics.consistency}% | ${consistencyDiff.toFixed(1)}% |`);
  console.log(`| 平均耗时 | ${domesticMetrics.avgTime}秒 | ${internationalMetrics.avgTime}秒 | ${avgTimeDiff.toFixed(2)}秒 |`);

  console.log('\n');

  // Case-by-case comparison
  console.log('📋 逐案例对比:\n');

  /** Formats one side's decision, or a placeholder when that side has no result for the case. */
  const describeDecision = (result: TestResult | undefined): string =>
    result === undefined ? '(无结果)' : `${result.aiDecision} ${result.isCorrect ? '✅' : '❌'}`;

  // Walk the longer list so that a length mismatch is shown instead of crashing.
  const caseCount = Math.max(domesticResults.length, internationalResults.length);
  for (let i = 0; i < caseCount; i++) {
    const domestic: TestResult | undefined = domesticResults[i];
    const international: TestResult | undefined = internationalResults[i];
    const reference = domestic ?? international;

    console.log(`[案例 ${i + 1}] ${reference?.title ?? ''}`);
    console.log(` 人类: ${reference?.humanDecision ?? ''}`);
    console.log(` 国内模型: ${describeDecision(domestic)}`);
    console.log(` 国际模型: ${describeDecision(international)}`);

    if (domestic?.aiDecision !== international?.aiDecision) {
      console.log(` ⚠️ 两组模型判断不一致!`);
    }
    console.log('');
  }

  // Conclusion
  console.log('\n' + '='.repeat(80));
  console.log('🎯 结论分析\n');

  let analysis: string;
  if (domesticMetrics.total === 0 || internationalMetrics.total === 0) {
    // Without results on both sides no comparison is meaningful.
    analysis = '数据不足';
    console.log('⚠️ 结论: 至少一组模型没有测试结果,无法比较');
  } else if (Math.abs(accuracyDiff) <= SIMILARITY_THRESHOLD) {
    analysis = 'Prompt问题';
    console.log(`✅ 结论: 国内外模型准确率相近(差异≤${SIMILARITY_THRESHOLD}%)`);
    console.log(' → 问题不在模型能力,而在于:');
    console.log(' 1. Prompt设计(可能过于严格)');
    console.log(' 2. AI vs 人类对"匹配"的理解差异');
    console.log(' 3. 纳排标准本身存在歧义');
    console.log('\n💡 建议: 优化Prompt策略,增加宽松/标准/严格三种模式');
  } else if (accuracyDiff > SIMILARITY_THRESHOLD) {
    analysis = '国际模型更优';
    console.log(`✅ 结论: 国际模型显著优于国内模型(差异>${SIMILARITY_THRESHOLD}%)`);
    console.log(' → 问题在于模型能力差异');
    console.log(' → 国际模型对医学文献的理解更准确');
    console.log('\n💡 建议: 优先使用GPT-4o或Claude-4.5进行筛选');
  } else {
    analysis = '国内模型更优';
    console.log(`✅ 结论: 国内模型优于国际模型(差异>${SIMILARITY_THRESHOLD}%)`);
    console.log(' → 可能是国内模型对中文医学术语理解更好');
    console.log(' → 或者国内模型更符合中国专家的筛选习惯');
    console.log('\n💡 建议: 继续使用国内模型组合');
  }

  console.log('='.repeat(80) + '\n');

  // Save the detailed report
  const report = {
    testDate: new Date().toISOString(),
    testCases: testCases.length,
    domesticModels: modelPairs[0],
    internationalModels: modelPairs[1],
    domesticMetrics,
    internationalMetrics,
    domesticResults,
    internationalResults,
    conclusion: {
      accuracyDiff,
      analysis
    }
  };

  const reportPath = path.join(__dirname, '../docs/国内外模型对比测试报告.json');
  // Create the target directory if needed so the results of a long run are not lost.
  fs.mkdirSync(path.dirname(reportPath), { recursive: true });
  fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), 'utf-8');
  console.log(`📄 详细报告已保存: ${reportPath}\n`);
}
|
||
|
||
// ========================================
|
||
// 🚀 6. 执行主流程
|
||
// ========================================
|
||
|
||
/**
 * Runs the comparison end to end: screens the selected test cases with the
 * domestic pair, pauses briefly, screens them with the international pair,
 * then prints and saves the comparison report.
 *
 * @throws If no test cases were selected or fewer than two model pairs are configured.
 */
async function main(): Promise<void> {
  // Rough per-call duration used only for the printed time estimate.
  const ESTIMATED_SECONDS_PER_CALL = 15;
  // Pause between the two runs to avoid API rate limiting.
  const PAUSE_BETWEEN_PAIRS_MS = 2000;
  const PAIR_COUNT = 2;

  const [domesticPair, internationalPair] = modelPairs;
  if (!domesticPair || !internationalPair) {
    throw new Error('modelPairs 至少需要两组模型配置');
  }
  // Without samples every metric would be meaningless, so stop before calling any model.
  if (testCases.length === 0) {
    throw new Error('没有可用的测试样本');
  }

  const estimatedSeconds = testCases.length * PAIR_COUNT * ESTIMATED_SECONDS_PER_CALL;

  console.log('\n🚀 开始国内外模型对比测试\n');
  console.log(`测试样本: ${testCases.length}篇`);
  console.log(`测试组合: ${PAIR_COUNT}组`);
  console.log(`预计耗时: ${estimatedSeconds}秒(约${Math.ceil(estimatedSeconds / 60)}分钟)\n`);

  // Domestic pair
  const domesticResults = await testModelPair(
    domesticPair.name,
    domesticPair.model1,
    domesticPair.model2,
    testCases
  );

  console.log('\n⏳ 等待2秒后测试国际模型...\n');
  await new Promise<void>(resolve => setTimeout(resolve, PAUSE_BETWEEN_PAIRS_MS));

  // International pair
  const internationalResults = await testModelPair(
    internationalPair.name,
    internationalPair.model1,
    internationalPair.model2,
    testCases
  );

  // Comparison report
  generateComparisonReport(domesticResults, internationalResults);

  console.log('✅ 测试完成!\n');
}
|
||
|
||
// Log any unhandled failure and exit non-zero so shells and CI can detect it.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|
||
|