refactor(asl): ASL frontend architecture refactoring with left navigation

- feat: Create ASLLayout component with 7-module left navigation
- feat: Implement Title Screening Settings page with optimized PICOS layout
- feat: Add placeholder pages for Workbench and Results
- fix: Fix nested routing structure for React Router v6
- fix: Resolve Spin component warning in MainLayout
- fix: Add QueryClientProvider to App.tsx
- style: Optimize PICOS form layout (P+I left, C+O+S right)
- style: Align Inclusion/Exclusion criteria side-by-side
- docs: Add architecture refactoring and routing fix reports

Ref: Week 2 Frontend Development
Scope: ASL module MVP - Title Abstract Screening
2025-11-18 21:51:51 +08:00
parent e3e7e028e8
commit 3634933ece
213 changed files with 20054 additions and 442 deletions
--- a/backend/scripts/test-stroke-screening-international-models.ts
+++ b/backend/scripts/test-stroke-screening-international-models.ts
@@ -0,0 +1,348 @@
+/**
+ * 卒中数据测试 - 国际模型对比
+ * 
+ * 目的：对比国内模型（DeepSeek+Qwen）vs 国际模型（GPT-4o+Claude）
+ * 
+ * 测试假设：
+ * 1. 如果国际模型准确率更高 → 是模型能力问题
+ * 2. 如果国际模型准确率相似 → 是Prompt或理解差异问题
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as XLSX from 'xlsx';
+import { fileURLToPath } from 'url';
+import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';
+
+// ES modules do not provide __filename/__dirname; derive them from import.meta.url.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+// ========================================
+// 📋 1. Load PICOS and inclusion/exclusion criteria
+// ========================================
+
+console.log('📖 正在读取PICOS和纳排标准...\n');
+
+// Location of the PICOS / criteria text file, resolved relative to this script.
+const picosPath = path.join(
+  __dirname,
+  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/测试案例的PICOS、纳入标准、排除标准.txt'
+);
+
+// NOTE(review): picosContent is read but not referenced again in the visible
+// code; the criteria below are hard-coded. The read still throws if the file
+// is missing — confirm whether that is intentional.
+const picosContent = fs.readFileSync(picosPath, 'utf-8');
+
+// PICOS criteria (simplified: hard-coded here rather than parsed from the file)
+const picoCriteria = {
+  population: '非心源性缺血性卒中患者、亚洲人群',
+  intervention: '抗血小板药物/抗凝药物/溶栓药物（阿司匹林、氯吡格雷、替格瑞洛、达比加群等）',
+  comparison: '安慰剂或常规治疗',
+  outcome: '卒中进展、复发、残疾程度、死亡率、出血事件等',
+  studyDesign: 'SR、RCT、RWE、OBS'
+};
+
+// Inclusion criteria, passed verbatim to the screening service as prompt text.
+const inclusionCriteria = `
+1. 研究对象为非心源性缺血性卒中患者
+2. 研究人群为亚洲人群（优先）
+3. 干预措施为抗血小板/抗凝/溶栓药物
+4. 对照组为安慰剂或常规治疗
+5. 研究时间在2020年之后
+6. 研究设计为SR、RCT、RWE、OBS
+`;
+
+// Exclusion criteria, passed verbatim to the screening service as prompt text.
+const exclusionCriteria = `
+1. 综述、病例报告、会议摘要
+2. 动物实验、体外实验
+3. 研究人群非亚洲人群（除非有特殊价值）
+4. 研究时间在2020年之前
+5. 心源性卒中或出血性卒中
+`;
+
+console.log('✅ PICOS标准已加载\n');
+
+// ========================================
+// 📋 2. Load test cases
+// ========================================
+
+console.log('📖 正在读取测试案例...\n');
+
+// Location of the Excel workbook holding the human-labelled test cases.
+const excelPath = path.join(
+  __dirname,
+  '../../docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx'
+);
+
+// Only the first worksheet is read; each row becomes one plain object.
+const workbook = XLSX.read(fs.readFileSync(excelPath), { type: 'buffer' });
+const sheetName = workbook.SheetNames[0];
+const worksheet = workbook.Sheets[sheetName];
+const data = XLSX.utils.sheet_to_json(worksheet);
+
+console.log(`✅ 读取到 ${data.length} 条数据\n`);
+
+/**
+ * Returns up to `limit` rows whose `Decision` cell contains `keyword`
+ * (case-insensitive). Rows without a `Decision` cell never match.
+ */
+function pickCasesByDecision(keyword: string, limit: number) {
+  return data
+    .filter((row: any) => row['Decision']?.toString().toLowerCase().includes(keyword))
+    .slice(0, limit);
+}
+
+// Test sample: at most 2 included + 3 excluded cases.
+const includedCases = pickCasesByDecision('include', 2);
+const excludedCases = pickCasesByDecision('exclude', 3);
+
+const testCases = [...includedCases, ...excludedCases];
+
+// Report the counts actually found: the sheet may hold fewer matching rows
+// than requested, and a hard-coded "2 + 3" would then misdescribe the sample.
+console.log(
+  `✅ 选择测试样本: ${testCases.length}篇（${includedCases.length} Included + ${excludedCases.length} Excluded）\n`
+);
+
+// ========================================
+// 🧪 3. Define the model pairs under test
+// ========================================
+
+// Each entry names the two model identifiers handed to the dual-model
+// screening call. Index 0 is treated as the domestic pair and index 1 as the
+// international pair by the rest of this script.
+// NOTE(review): the id 'qwen3-72b' and the description 'Qwen3-Max' do not
+// obviously refer to the same model — confirm which one is intended.
+const modelPairs = [
+  {
+    name: '国内模型组合',
+    model1: 'deepseek-chat',
+    model2: 'qwen3-72b',
+    description: 'DeepSeek-V3 + Qwen3-Max（当前使用）'
+  },
+  {
+    name: '国际模型组合',
+    model1: 'gpt-4o',
+    model2: 'claude-sonnet-4.5',
+    description: 'GPT-4o + Claude-4.5（国际顶级模型）'
+  }
+];
+
+// ========================================
+// 🧪 4. Run the tests
+// ========================================
+
+/** Outcome of screening one test case with one model pair. */
+interface TestResult {
+  /** 1-based position of the case within the tested sample. */
+  caseIndex: number;
+  /** Case title, truncated to at most 100 characters. */
+  title: string;
+  /** Human label normalized to 'include' or 'exclude'. */
+  humanDecision: string;
+  /** AI decision; 'pending' is stored as 'uncertain', and 'error' marks a failed call. */
+  aiDecision: string;
+  /** First model's result as returned by the screening service, or null on failure. */
+  model1Result: any;
+  /** Second model's result as returned by the screening service, or null on failure. */
+  model2Result: any;
+  /** True when the AI decision equals the normalized human decision. */
+  isCorrect: boolean;
+  /** Conflict flag reported by the screening service; false when the call failed. */
+  hasConflict: boolean;
+  /** Elapsed wall-clock time for this case, in milliseconds. */
+  processingTime: number;
+}
+
+/**
+ * Screens every case with the given pair of models, one case at a time, and
+ * records how each AI decision compares with the human label.
+ *
+ * A failing screening call does not abort the run: it is logged and recorded
+ * as a result with `aiDecision: 'error'`.
+ *
+ * @param pairName Display name of the model pair (used in log output only).
+ * @param model1 Identifier of the first model.
+ * @param model2 Identifier of the second model.
+ * @param cases Spreadsheet rows; the `title`, `abstract` and `Decision` cells are read.
+ * @returns One TestResult per input case, in input order.
+ */
+async function testModelPair(
+  pairName: string,
+  model1: string,
+  model2: string,
+  cases: any[]
+): Promise<TestResult[]> {
+  console.log(`\n${'='.repeat(60)}`);
+  console.log(`🧪 测试模型组合: ${pairName}`);
+  console.log(`${'='.repeat(60)}\n`);
+
+  const results: TestResult[] = [];
+
+  for (let i = 0; i < cases.length; i++) {
+    const testCase = cases[i];
+    // Spreadsheet cells can be numbers or missing. Coerce to strings up front
+    // so the string methods below cannot throw outside the try block (or
+    // inside the catch handler) and abort the whole run.
+    const title = String(testCase['title'] ?? '');
+    const abstract = String(testCase['abstract'] ?? '');
+    const humanDecision = String(testCase['Decision'] ?? '');
+
+    // Normalize the human label once; anything not containing "include" is
+    // treated as an exclusion.
+    const normalizedHuman = humanDecision.toLowerCase().includes('include') ? 'include' : 'exclude';
+
+    console.log(`\n[${i + 1}/${cases.length}] 正在筛选...`);
+    console.log(`标题: ${title.substring(0, 60)}...`);
+    console.log(`人类决策: ${humanDecision}`);
+
+    const startTime = Date.now();
+
+    try {
+      const screeningResult = await llmScreeningService.dualModelScreening(
+        `test-case-${i + 1}`,  // literatureId
+        title,
+        abstract,
+        picoCriteria,
+        inclusionCriteria,
+        exclusionCriteria,
+        [model1, model2],  // the models argument is an array
+        'standard'  // style argument
+      );
+
+      const processingTime = Date.now() - startTime;
+
+      // 'pending' is reported as 'uncertain', which never matches a human label.
+      const normalizedAI = screeningResult.finalDecision === 'pending' ? 'uncertain' : screeningResult.finalDecision;
+
+      const isCorrect = normalizedAI === normalizedHuman;
+
+      console.log(`AI决策: ${screeningResult.finalDecision} ${isCorrect ? '✅' : '❌'}`);
+      console.log(`模型一致: ${!screeningResult.hasConflict ? '✅' : '❌'}`);
+      console.log(`处理时间: ${(processingTime / 1000).toFixed(2)}秒`);
+
+      results.push({
+        caseIndex: i + 1,
+        title: title.substring(0, 100),
+        humanDecision: normalizedHuman,
+        aiDecision: normalizedAI,
+        model1Result: screeningResult.model1Result,
+        model2Result: screeningResult.model2Result,
+        isCorrect,
+        hasConflict: screeningResult.hasConflict,
+        processingTime
+      });
+
+    } catch (error: unknown) {
+      // Narrow before reading .message; thrown values are not always Errors.
+      const message = error instanceof Error ? error.message : String(error);
+      console.error(`❌ 筛选失败: ${message}`);
+      results.push({
+        caseIndex: i + 1,
+        title: title.substring(0, 100),
+        humanDecision: normalizedHuman,
+        aiDecision: 'error',
+        model1Result: null,
+        model2Result: null,
+        isCorrect: false,
+        hasConflict: false,
+        processingTime: Date.now() - startTime
+      });
+    }
+  }
+
+  return results;
+}
+
+// ========================================
+// 📊 5. Generate the comparison report
+// ========================================
+
+/**
+ * Prints a domestic-vs-international comparison (summary table, per-case
+ * breakdown and a conclusion) and writes the full details to a JSON file
+ * under `../docs` relative to this script.
+ *
+ * @param domesticResults Results produced by the domestic model pair.
+ * @param internationalResults Results produced by the international model pair.
+ */
+function generateComparisonReport(
+  domesticResults: TestResult[],
+  internationalResults: TestResult[]
+) {
+  console.log(`\n${'='.repeat(80)}`);
+  console.log(`📊 国内 vs 国际模型对比报告`);
+  console.log(`${'='.repeat(80)}\n`);
+
+  // Compute aggregate metrics for one result set.
+  function calculateMetrics(results: TestResult[]) {
+    const total = results.length;
+    const correct = results.filter(r => r.isCorrect).length;
+    // Failed screenings are recorded with hasConflict=false; they must not be
+    // counted as the two models agreeing.
+    const consistent = results.filter(r => r.aiDecision !== 'error' && !r.hasConflict).length;
+    const totalTime = results.reduce((sum, r) => sum + r.processingTime, 0);
+    // Avoid 0/0: an empty run reports zeros instead of "NaN", which would
+    // otherwise fall through to the wrong conclusion branch below.
+    const divisor = total === 0 ? 1 : total;
+
+    return {
+      accuracy: (correct / divisor * 100).toFixed(1),
+      consistency: (consistent / divisor * 100).toFixed(1),
+      avgTime: (totalTime / divisor / 1000).toFixed(2),
+      correct,
+      total
+    };
+  }
+
+  const domesticMetrics = calculateMetrics(domesticResults);
+  const internationalMetrics = calculateMetrics(internationalResults);
+
+  // Comparison table
+  console.log('| 指标 | 国内模型 | 国际模型 | 差异 |');
+  console.log('|------|----------|----------|------|');
+  console.log(`| 准确率 | ${domesticMetrics.accuracy}% (${domesticMetrics.correct}/${domesticMetrics.total}) | ${internationalMetrics.accuracy}% (${internationalMetrics.correct}/${internationalMetrics.total}) | ${(parseFloat(internationalMetrics.accuracy) - parseFloat(domesticMetrics.accuracy)).toFixed(1)}% |`);
+  console.log(`| 一致率 | ${domesticMetrics.consistency}% | ${internationalMetrics.consistency}% | ${(parseFloat(internationalMetrics.consistency) - parseFloat(domesticMetrics.consistency)).toFixed(1)}% |`);
+  console.log(`| 平均耗时 | ${domesticMetrics.avgTime}秒 | ${internationalMetrics.avgTime}秒 | ${(parseFloat(internationalMetrics.avgTime) - parseFloat(domesticMetrics.avgTime)).toFixed(2)}秒 |`);
+
+  console.log('\n');
+
+  // Case-by-case comparison. Only indices present in both result sets are
+  // compared, so mismatched lengths cannot dereference undefined.
+  console.log('📋 逐案例对比:\n');
+  const comparableCount = Math.min(domesticResults.length, internationalResults.length);
+  for (let i = 0; i < comparableCount; i++) {
+    const domestic = domesticResults[i];
+    const international = internationalResults[i];
+
+    console.log(`[案例 ${i + 1}] ${domestic.title}`);
+    console.log(`  人类: ${domestic.humanDecision}`);
+    console.log(`  国内模型: ${domestic.aiDecision} ${domestic.isCorrect ? '✅' : '❌'}`);
+    console.log(`  国际模型: ${international.aiDecision} ${international.isCorrect ? '✅' : '❌'}`);
+
+    if (domestic.aiDecision !== international.aiDecision) {
+      console.log(`  ⚠️ 两组模型判断不一致！`);
+    }
+    console.log('');
+  }
+
+  // Conclusion
+  console.log('\n' + '='.repeat(80));
+  console.log('🎯 结论分析\n');
+
+  // Positive means the international pair was more accurate (percentage points).
+  const accuracyDiff = parseFloat(internationalMetrics.accuracy) - parseFloat(domesticMetrics.accuracy);
+
+  if (Math.abs(accuracyDiff) <= 10) {
+    console.log('✅ 结论: 国内外模型准确率相近（差异≤10%）');
+    console.log('   → 问题不在模型能力，而在于：');
+    console.log('     1. Prompt设计（可能过于严格）');
+    console.log('     2. AI vs 人类对"匹配"的理解差异');
+    console.log('     3. 纳排标准本身存在歧义');
+    console.log('\n💡 建议: 优化Prompt策略，增加宽松/标准/严格三种模式');
+  } else if (accuracyDiff > 10) {
+    console.log('✅ 结论: 国际模型显著优于国内模型（差异>10%）');
+    console.log('   → 问题在于模型能力差异');
+    console.log('   → 国际模型对医学文献的理解更准确');
+    console.log('\n💡 建议: 优先使用GPT-4o或Claude-4.5进行筛选');
+  } else {
+    console.log('✅ 结论: 国内模型优于国际模型（差异>10%）');
+    console.log('   → 可能是国内模型对中文医学术语理解更好');
+    console.log('   → 或者国内模型更符合中国专家的筛选习惯');
+    console.log('\n💡 建议: 继续使用国内模型组合');
+  }
+
+  console.log('='.repeat(80) + '\n');
+
+  // Persist the detailed report
+  const report = {
+    testDate: new Date().toISOString(),
+    testCases: testCases.length,
+    domesticModels: modelPairs[0],
+    internationalModels: modelPairs[1],
+    domesticMetrics,
+    internationalMetrics,
+    domesticResults,
+    internationalResults,
+    conclusion: {
+      accuracyDiff,
+      analysis: Math.abs(accuracyDiff) <= 10 ? 'Prompt问题' : (accuracyDiff > 10 ? '国际模型更优' : '国内模型更优')
+    }
+  };
+
+  const reportPath = path.join(__dirname, '../docs/国内外模型对比测试报告.json');
+  // Create the target directory first so a fresh checkout without ../docs
+  // does not lose the results of a long test run.
+  fs.mkdirSync(path.dirname(reportPath), { recursive: true });
+  fs.writeFileSync(reportPath, JSON.stringify(report, null, 2), 'utf-8');
+  console.log(`📄 详细报告已保存: ${reportPath}\n`);
+}
+
+// ========================================
+// 🚀 6. Run the main flow
+// ========================================
+
+/**
+ * Entry point: screens the selected test cases with the domestic pair, then
+ * with the international pair, and finally prints and saves the comparison.
+ */
+async function main() {
+  console.log('\n🚀 开始国内外模型对比测试\n');
+  console.log(`测试样本: ${testCases.length}篇`);
+  console.log(`测试组合: 2组`);
+  // Rough estimate: 15 seconds per case for each of the two pairs.
+  console.log(`预计耗时: ${testCases.length * 2 * 15}秒（约${Math.ceil(testCases.length * 2 * 15 / 60)}分钟）\n`);
+
+  // Domestic models
+  const domesticResults = await testModelPair(
+    modelPairs[0].name,
+    modelPairs[0].model1,
+    modelPairs[0].model2,
+    testCases
+  );
+
+  // Pause for 2 seconds between pairs to avoid API rate limiting
+  console.log('\n⏳ 等待2秒后测试国际模型...\n');
+  await new Promise(resolve => setTimeout(resolve, 2000));
+
+  // International models
+  const internationalResults = await testModelPair(
+    modelPairs[1].name,
+    modelPairs[1].model1,
+    modelPairs[1].model2,
+    testCases
+  );
+
+  // Build the comparison report
+  generateComparisonReport(domesticResults, internationalResults);
+
+  console.log('✅ 测试完成！\n');
+}
+
+// Log the failure AND signal it through the exit code, so shells and CI do
+// not treat a crashed run as a success.
+main().catch((error: unknown) => {
+  console.error(error);
+  process.exitCode = 1;
+});
+