// AIclinicalresearch/backend/scripts/test-stroke-screening.ts

/**
 * 卒中文献筛选测试脚本
 * 用真实数据验证泛化能力
 */

import XLSX from 'xlsx';
import * as path from 'path';
import { fileURLToPath } from 'url';
import { llmScreeningService } from '../src/modules/asl/services/llmScreeningService.js';

// Derive this module's file path and directory from import.meta.url
// (ES modules do not provide __filename/__dirname as built-ins).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * PICOS criteria for the stroke study (taken from the test documentation).
 * Passed unchanged to the screening service for every test case; the values
 * are free-text descriptions written in Chinese.
 */
const STROKE_PICOS = {
  population: "非心源性缺血性卒中（NCIS）患者、亚洲人群",
  intervention: "抗血小板治疗药物（阿司匹林、氯吡格雷、奥扎格雷、贝前列素、西洛他唑、替罗非班、替格瑞洛、吲哚布芬、沙格雷酯、氯吡格雷阿司匹林、双嘧达莫等）或抗凝药物（阿加曲班、asundexian、milvexian、华法林、低分子肝素、肝素等）或溶栓药物（链激酶、尿激酶、阿替普酶、替奈普酶等）",
  comparison: "安慰剂或常规治疗",
  outcome: "疗效安全性：卒中进展、神经功能恶化、卒中复发、残疾、死亡、NIHSS评分变化、VTE、痴呆、认知功能减退、疲乏、抑郁等",
  studyDesign: "系统评价（SR）、随机对照试验（RCT）、真实世界研究（RWE）、观察性研究（OBS）"
};

/**
 * Inclusion criteria, as a numbered free-text list (Chinese).
 * Passed verbatim to the screening service, so the text itself is runtime data.
 */
const INCLUSION_CRITERIA = `
1. 非心源性缺血性卒中、亚洲患者
2. 卒中后接受二级预防治疗的患者（Secondary Stroke Prevention, SSP）
3. 干预措施为抗血小板、抗凝或溶栓药物
4. 报告疗效或安全性结局（卒中进展、复发、残疾、死亡等）
5. 研究类型：系统评价、RCT、真实世界研究、观察性研究
6. 研究时间：2020年之后的文献
7. 包含"二级预防"或"预防复发"或"卒中预防"相关内容
8. 涉及抗血小板或抗凝药物
`;

/**
 * Exclusion criteria, as a numbered free-text list (Chinese).
 * Passed verbatim to the screening service, so the text itself is runtime data.
 */
const EXCLUSION_CRITERIA = `
1. 心源性卒中患者、非亚洲人群
2. 其他类型卒中（非缺血性）
3. 用于急性冠脉综合征（ACS）的抗血小板治疗，未明确提及卒中
4. 房颤（AF）患者
5. 混合人群（包含非卒中患者）
6. 病例报告
7. 非中英文文献
8. 仅包含急性期治疗（如急性期溶栓、取栓），未涉及二级预防
`;

/** One literature record taken from the Excel test data, with its human screening label. */
interface TestCase {
  /** 1-based position of the case within the selected test set. */
  index: number;
  /** Record identifier: the row's `key` column, or a generated `test-N` fallback. */
  pmid: string;
  /** Article title (`title` column). */
  title: string;
  /** Article abstract (`abstract` column). */
  abstract: string;
  /** Human reviewer's verdict from the `Decision` column (Include/Exclude style labels). */
  humanDecision: string;  // Include/Exclude
  /** Reviewer's stated reason from the `Reason for excluded` column, when present. */
  excludeReason?: string;
}

/** Converts a raw spreadsheet cell to text; null/undefined become the empty string. */
function cellToText(value: unknown): string {
  return value == null ? '' : String(value);
}

/**
 * Builds a TestCase from one spreadsheet row.
 *
 * Cell values are coerced to strings because the sheet reader can hand back
 * non-string values (e.g. a numeric `key`), while TestCase declares strings.
 *
 * @param row - Row object keyed by column header.
 * @param index - 1-based position of the case in the selected test set.
 */
function rowToTestCase(row: Record<string, unknown>, index: number): TestCase {
  const key = row['key'];
  const decision = row['Decision'];
  const reason = row['Reason for excluded'];

  return {
    index,
    pmid: key ? String(key) : `test-${index}`,
    title: cellToText(row['title']),
    abstract: cellToText(row['abstract']),
    humanDecision: decision ? String(decision) : 'Unknown',
    excludeReason: reason ? String(reason) : undefined,
  };
}

/**
 * Reads the first sheet of an Excel workbook and selects a mixed set of
 * human-labelled test cases.
 *
 * Rows without a title or abstract are skipped. Remaining rows are bucketed by
 * their `Decision` column (case-insensitive substring match on "include" /
 * "exclude"); rows matching neither are ignored.
 *
 * @param filePath - Path of the workbook to read.
 * @param limit - Maximum number of cases to return. The budget is split 2:3
 *   between included and excluded rows (so the default of 5 yields up to
 *   2 included + 3 excluded). A bucket that is short is not back-filled from
 *   the other one.
 * @returns The selected cases, included rows first, indexed from 1.
 * @throws Whatever `XLSX.readFile` throws when the file cannot be read.
 */
async function readExcelTestCases(filePath: string, limit: number = 5): Promise<TestCase[]> {
  console.log(`📖 读取Excel文件: ${filePath}`);

  const workbook = XLSX.readFile(filePath);
  const worksheet = workbook.Sheets[workbook.SheetNames[0]];
  const rows = XLSX.utils.sheet_to_json<Record<string, unknown>>(worksheet);

  console.log(`✅ 读取到 ${rows.length} 条数据`);

  // Bucket usable rows by the human decision so the sample can mix both classes.
  const includedRows: Record<string, unknown>[] = [];
  const excludedRows: Record<string, unknown>[] = [];

  for (const row of rows) {
    // Skip rows that lack a title or an abstract.
    if (!row['title'] || !row['abstract']) {
      continue;
    }

    // Coerce before lower-casing: a non-string cell must not crash the run.
    const decision = cellToText(row['Decision']).toLowerCase();
    if (decision.includes('include')) {
      includedRows.push(row);
    } else if (decision.includes('exclude')) {
      excludedRows.push(row);
    }
  }

  console.log(`   - Included案例: ${includedRows.length}条`);
  console.log(`   - Excluded案例: ${excludedRows.length}条`);

  // Split the budget 2:3 (integer arithmetic keeps limit=5 at exactly 2 + 3).
  const total = Number.isNaN(limit) ? 0 : Math.max(0, Math.floor(limit));
  const includedQuota = Math.floor((total * 2) / 5);
  const excludedQuota = Number.isFinite(total) ? total - includedQuota : total;

  const testCases = [
    ...includedRows.slice(0, includedQuota),
    ...excludedRows.slice(0, excludedQuota),
  ].map((row, position) => rowToTestCase(row, position + 1));

  console.log(`✅ 提取 ${testCases.length} 条有效测试案例 (${testCases.filter(t => t.humanDecision.toLowerCase().includes('include')).length} Included + ${testCases.filter(t => t.humanDecision.toLowerCase().includes('exclude')).length} Excluded)\n`);
  return testCases;
}

/**
 * Screens a single literature record with the dual-model service and checks
 * the AI verdict against the human label.
 *
 * @param testCase - Record to screen, including the human decision to compare with.
 * @param models - Pair of model identifiers forwarded to the screening service.
 * @returns The test case, the AI decision label ('Include', 'Exclude' or
 *   'Uncertain'; 'Error' when screening throws), whether it matches the human
 *   label, whether the two models agreed, and the raw service result (null on error).
 */
async function testSingleLiterature(
  testCase: TestCase,
  models: [string, string]
): Promise<{
  testCase: TestCase;
  aiDecision: string;
  isCorrect: boolean;
  hasConsensus: boolean;
  details: any;
}> {
  // Collapse label variants (Included/Include, Excluded/Exclude) to one comparable token.
  const toVerdict = (label: string): string => {
    const lowered = label.toLowerCase();
    if (lowered.includes('include')) {
      return 'include';
    }
    return lowered.includes('exclude') ? 'exclude' : lowered;
  };

  console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
  console.log(`[${testCase.index}] PMID: ${testCase.pmid}`);
  console.log(`标题: ${testCase.title.slice(0, 100)}...`);
  console.log(`人类判断: ${testCase.humanDecision}`);

  try {
    const startedAt = Date.now();
    const literatureId = testCase.pmid || `test-${testCase.index}`;

    const screening = await llmScreeningService.dualModelScreening(
      literatureId,
      testCase.title,
      testCase.abstract,
      STROKE_PICOS,
      INCLUSION_CRITERIA,
      EXCLUSION_CRITERIA,
      models
    );

    const elapsedMs = Date.now() - startedAt;

    // Translate the service's final decision into the Include/Exclude vocabulary.
    let aiDecision: string;
    switch (screening.finalDecision) {
      case 'include':
        aiDecision = 'Include';
        break;
      case 'exclude':
        aiDecision = 'Exclude';
        break;
      default:
        aiDecision = 'Uncertain';
    }

    const isCorrect = toVerdict(aiDecision) === toVerdict(testCase.humanDecision);

    console.log(`AI判断: ${aiDecision}`);
    console.log(`DeepSeek: ${screening.deepseek.conclusion} (置信度: ${screening.deepseek.confidence})`);
    console.log(`Qwen: ${screening.qwen.conclusion} (置信度: ${screening.qwen.confidence})`);
    console.log(`一致性: ${screening.hasConflict ? '❌ 冲突' : '✅ 一致'}`);
    console.log(`结果: ${isCorrect ? '✅ 正确' : '❌ 错误'}`);
    console.log(`耗时: ${elapsedMs}ms`);

    // On a mismatch, print both sides' reasoning to help diagnose the error.
    if (isCorrect === false) {
      console.log(`\n❌ 判断错误！`);
      console.log(`期望: ${testCase.humanDecision}`);
      console.log(`实际: ${aiDecision}`);
      if (testCase.excludeReason) {
        console.log(`人类排除理由: ${testCase.excludeReason}`);
      }
      console.log(`DeepSeek理由: ${screening.deepseek.reason}`);
      console.log(`Qwen理由: ${screening.qwen.reason}`);
    }

    const hasConsensus = !screening.hasConflict;
    return { testCase, aiDecision, isCorrect, hasConsensus, details: screening };
  } catch (failure) {
    // A failed screening call is reported and recorded as an incorrect, non-consensus result.
    console.error(`❌ 测试失败:`, failure);
    return {
      testCase,
      aiDecision: 'Error',
      isCorrect: false,
      hasConsensus: false,
      details: null
    };
  }
}

/**
 * Script entry point: loads the stroke screening test cases, screens each one
 * with the dual-model service, and prints accuracy / consensus statistics
 * followed by a verdict.
 *
 * The Excel file is looked up at several candidate locations, tried in order.
 * If none can be read, the last read error is rethrown, so the returned
 * promise rejects. If the file is read but yields no usable cases, the
 * function logs an error and returns without running any screening.
 */
async function main() {
  const ACCURACY_TARGET = 85;   // percent; at or above this the run passes
  const ACCURACY_MEDIUM = 60;   // percent; lower bound of the "needs tuning" band
  const CONSENSUS_TARGET = 80;  // percent
  const REQUEST_GAP_MS = 2000;  // pause between cases to avoid API rate limiting

  console.log('\n🔬 卒中文献筛选测试');
  console.log('='.repeat(60));
  console.log('目的: 验证系统对不同研究主题的泛化能力\n');

  // Load the test data.
  const dataFile = 'docs/03-业务模块/ASL-AI智能文献/05-测试文档/03-测试数据/screening/Test Cases.xlsx';
  const candidatePaths = [
    // Original location: docs/ next to the scripts directory.
    path.join(__dirname, '..', dataFile),
    // The absolute fallback below places docs/ directly under the project root,
    // which is two levels above this script if it lives in backend/scripts.
    path.join(__dirname, '..', '..', dataFile),
    // Machine-specific last resort kept from the original script.
    'D:\\MyCursor\\AIclinicalresearch\\docs\\03-业务模块\\ASL-AI智能文献\\05-测试文档\\03-测试数据\\screening\\Test Cases.xlsx',
  ];

  let testCases: TestCase[] | undefined;
  let lastError: unknown;
  for (const candidate of candidatePaths) {
    try {
      testCases = await readExcelTestCases(candidate, 5);
      break;
    } catch (error: unknown) {
      // Report why this location failed, then fall through to the next one.
      lastError = error;
      const reason = error instanceof Error ? error.message : String(error);
      console.error(`❌ 读取Excel失败 (${candidate}): ${reason}`);
    }
  }

  if (testCases === undefined) {
    throw lastError instanceof Error ? lastError : new Error(String(lastError));
  }

  if (testCases.length === 0) {
    console.error('❌ 没有读取到有效的测试案例');
    return;
  }

  console.log('📋 PICOS标准:');
  console.log(`P: ${STROKE_PICOS.population}`);
  console.log(`I: ${STROKE_PICOS.intervention.substring(0, 80)}...`);
  console.log(`C: ${STROKE_PICOS.comparison}`);
  console.log(`O: ${STROKE_PICOS.outcome.substring(0, 80)}...`);
  console.log(`S: ${STROKE_PICOS.studyDesign}`);

  console.log('\n🚀 开始测试...');
  console.log(`测试样本数: ${testCases.length}`);
  console.log(`测试模型: DeepSeek-V3 + Qwen-Max\n`);

  const results: Awaited<ReturnType<typeof testSingleLiterature>>[] = [];

  // Cases run one at a time, with a pause between them (none after the last).
  for (const [position, testCase] of testCases.entries()) {
    results.push(await testSingleLiterature(testCase, ['deepseek-chat', 'qwen-max']));

    if (position < testCases.length - 1) {
      await new Promise(resolve => setTimeout(resolve, REQUEST_GAP_MS));
    }
  }

  // Aggregate the results.
  console.log('\n\n' + '='.repeat(60));
  console.log('📊 测试结果统计');
  console.log('='.repeat(60));

  const totalTests = results.length;
  const correctCount = results.filter(r => r.isCorrect).length;
  const consensusCount = results.filter(r => r.hasConsensus).length;
  // Keep the unrounded percentages for threshold checks; rounding is for display
  // only, otherwise e.g. 84.96% would print as 85.0 and wrongly pass the target.
  const accuracy = totalTests > 0 ? (correctCount / totalTests) * 100 : 0;
  const consensusRate = totalTests > 0 ? (consensusCount / totalTests) * 100 : 0;

  console.log(`\n总测试数: ${totalTests}`);
  console.log(`正确判断: ${correctCount}`);
  console.log(`准确率: ${accuracy.toFixed(1)}% ${accuracy >= ACCURACY_TARGET ? '✅' : '❌'} (目标≥85%)`);
  console.log(`双模型一致率: ${consensusRate.toFixed(1)}% ${consensusRate >= CONSENSUS_TARGET ? '✅' : '❌'} (目标≥80%)`);

  console.log('\n📋 详细结果:');
  results.forEach((r, i) => {
    console.log(`${i + 1}. ${r.isCorrect ? '✅' : '❌'} PMID:${r.testCase.pmid} - 期望:${r.testCase.humanDecision}, AI:${r.aiDecision}`);
  });

  // Verdict.
  console.log('\n' + '='.repeat(60));
  console.log('🎯 结论');
  console.log('='.repeat(60));

  if (accuracy >= ACCURACY_TARGET) {
    console.log('✅ 测试通过！系统对卒中研究的筛选准确率达标！');
    console.log('📝 建议: 可以继续开发PICOS配置界面，实现MVP。');
  } else if (accuracy >= ACCURACY_MEDIUM) {
    console.log('⚠️  准确率中等。系统有一定泛化能力，但需要优化。');
    console.log('📝 建议: 分析错误案例，优化Prompt模板。');
  } else {
    console.log('❌ 准确率较低。当前Prompt对卒中研究泛化能力不足。');
    console.log('📝 建议: 需要重新设计Prompt策略，或考虑用户自定义方案。');
  }

  console.log('='.repeat(60) + '\n');
}

// Start the run; a rejection from main() is logged via console.error.
main().catch((failure) => {
  console.error(failure);
});