diff --git a/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql b/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql new file mode 100644 index 00000000..4d39cb31 --- /dev/null +++ b/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql @@ -0,0 +1,91 @@ +-- ASL Tool 3: Full-text Smart Extraction Workbench V2.0 +-- Architecture: Scatter-dispatch + Independent Worker + Aggregator polling reconciliation +-- 4 new tables in asl_schema + +-- CreateTable: System extraction templates (RCT / Cohort / QC) +CREATE TABLE "asl_schema"."extraction_templates" ( + "id" TEXT NOT NULL, + "code" TEXT NOT NULL, + "name" TEXT NOT NULL, + "description" TEXT, + "baseFields" JSONB NOT NULL, + "is_system" BOOLEAN NOT NULL DEFAULT true, + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "extraction_templates_pkey" PRIMARY KEY ("id") +); + +-- CreateTable: Project-level templates (cloned from system + custom fields) +CREATE TABLE "asl_schema"."extraction_project_templates" ( + "id" TEXT NOT NULL, + "project_id" TEXT NOT NULL, + "user_id" TEXT NOT NULL, + "base_template_id" TEXT NOT NULL, + "outcome_type" TEXT NOT NULL DEFAULT 'survival', + "custom_fields" JSONB NOT NULL DEFAULT '[]', + "is_locked" BOOLEAN NOT NULL DEFAULT false, + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "extraction_project_templates_pkey" PRIMARY KEY ("id") +); + +-- CreateTable: Extraction tasks (1 task = batch extract N documents) +CREATE TABLE "asl_schema"."extraction_tasks" ( + "id" TEXT NOT NULL, + "project_id" TEXT NOT NULL, + "user_id" TEXT NOT NULL, + "project_template_id" TEXT NOT NULL, + "pkb_knowledge_base_id" TEXT NOT NULL, + "idempotency_key" TEXT, + "total_count" INTEGER NOT NULL, + "status" TEXT NOT NULL DEFAULT 'processing', + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, 
+ "updated_at" TIMESTAMP(3) NOT NULL, + "completed_at" TIMESTAMP(3), + + CONSTRAINT "extraction_tasks_pkey" PRIMARY KEY ("id") +); + +-- CreateTable: Per-document extraction results (Worker only writes its own row) +CREATE TABLE "asl_schema"."extraction_results" ( + "id" TEXT NOT NULL, + "task_id" TEXT NOT NULL, + "project_id" TEXT NOT NULL, + "pkb_document_id" TEXT NOT NULL, + "snapshot_storage_key" TEXT NOT NULL, + "snapshot_filename" TEXT NOT NULL, + "status" TEXT NOT NULL DEFAULT 'pending', + "extracted_data" JSONB, + "quote_verification" JSONB, + "manual_overrides" JSONB, + "review_status" TEXT NOT NULL DEFAULT 'pending', + "reviewed_at" TIMESTAMP(3), + "error_message" TEXT, + "processed_at" TIMESTAMP(3), + "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updated_at" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "extraction_results_pkey" PRIMARY KEY ("id") +); + +-- Unique indexes +CREATE UNIQUE INDEX "extraction_templates_code_key" ON "asl_schema"."extraction_templates"("code"); +CREATE UNIQUE INDEX "extraction_tasks_idempotency_key_key" ON "asl_schema"."extraction_tasks"("idempotency_key"); +CREATE UNIQUE INDEX "unique_extraction_project_base_template" ON "asl_schema"."extraction_project_templates"("project_id", "base_template_id"); + +-- Performance indexes +CREATE INDEX "idx_extraction_project_templates_project_id" ON "asl_schema"."extraction_project_templates"("project_id"); +CREATE INDEX "idx_extraction_project_templates_user_id" ON "asl_schema"."extraction_project_templates"("user_id"); +CREATE INDEX "idx_extraction_tasks_project_id" ON "asl_schema"."extraction_tasks"("project_id"); +CREATE INDEX "idx_extraction_tasks_user_id" ON "asl_schema"."extraction_tasks"("user_id"); +CREATE INDEX "idx_extraction_tasks_status" ON "asl_schema"."extraction_tasks"("status"); +CREATE INDEX "idx_extraction_results_task_status" ON "asl_schema"."extraction_results"("task_id", "status"); +CREATE INDEX "idx_extraction_results_task_id" ON 
"asl_schema"."extraction_results"("task_id"); +CREATE INDEX "idx_extraction_results_project_id" ON "asl_schema"."extraction_results"("project_id"); + +-- Foreign keys +ALTER TABLE "asl_schema"."extraction_project_templates" ADD CONSTRAINT "extraction_project_templates_base_template_id_fkey" FOREIGN KEY ("base_template_id") REFERENCES "asl_schema"."extraction_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE; +ALTER TABLE "asl_schema"."extraction_tasks" ADD CONSTRAINT "extraction_tasks_project_template_id_fkey" FOREIGN KEY ("project_template_id") REFERENCES "asl_schema"."extraction_project_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE; +ALTER TABLE "asl_schema"."extraction_results" ADD CONSTRAINT "extraction_results_task_id_fkey" FOREIGN KEY ("task_id") REFERENCES "asl_schema"."extraction_tasks"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/backend/prisma/schema.prisma b/backend/prisma/schema.prisma index 88dbe4c1..24b3003c 100644 --- a/backend/prisma/schema.prisma +++ b/backend/prisma/schema.prisma @@ -590,6 +590,102 @@ model AslFulltextScreeningResult { @@schema("asl_schema") } +// ═══════════════════════════════════════════════════════════════ +// ASL 工具 3:全文智能提取工作台 V2.0 +// 架构:散装派发 + 独立 Worker + Aggregator 轮询收口 +// ═══════════════════════════════════════════════════════════════ + +/// 系统内置提取模板(RCT / Cohort / QC),管理员维护,用户只读 +model AslExtractionTemplate { + id String @id @default(uuid()) + code String @unique // RCT / Cohort / QC + name String // 随机对照试验 / 队列研究 / 质量改进 + description String? + baseFields Json // { metadata: [...], baseline: [...], rob: [...], outcomes_survival: [...], ... 
} + isSystem Boolean @default(true) @map("is_system") + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + + projectTemplates AslProjectTemplate[] @relation("BaseTemplateProjectTemplates") + + @@map("extraction_templates") + @@schema("asl_schema") +} + +/// 项目级模板(克隆自系统模板 + 用户自定义字段插槽,M3 启用自定义字段) +model AslProjectTemplate { + id String @id @default(uuid()) + projectId String @map("project_id") + userId String @map("user_id") + baseTemplateId String @map("base_template_id") + outcomeType String @default("survival") @map("outcome_type") // survival | dichotomous | continuous + customFields Json @default("[]") @map("custom_fields") // M3: [{name, type, prompt}] + isLocked Boolean @default(false) @map("is_locked") + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + + baseTemplate AslExtractionTemplate @relation("BaseTemplateProjectTemplates", fields: [baseTemplateId], references: [id]) + tasks AslExtractionTask[] @relation("TemplateExtractionTasks") + + @@unique([projectId, baseTemplateId], map: "unique_extraction_project_base_template") + @@index([projectId], map: "idx_extraction_project_templates_project_id") + @@index([userId], map: "idx_extraction_project_templates_user_id") + @@map("extraction_project_templates") + @@schema("asl_schema") +} + +/// 提取任务(1 个任务 = 批量提取 N 篇文献),状态仅由 Aggregator 修改 +model AslExtractionTask { + id String @id @default(uuid()) + projectId String @map("project_id") + userId String @map("user_id") + projectTemplateId String @map("project_template_id") + pkbKnowledgeBaseId String @map("pkb_knowledge_base_id") + idempotencyKey String? @unique @map("idempotency_key") + totalCount Int @map("total_count") + status String @default("processing") // processing | completed | failed + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + completedAt DateTime? 
@map("completed_at") + + projectTemplate AslProjectTemplate @relation("TemplateExtractionTasks", fields: [projectTemplateId], references: [id]) + results AslExtractionResult[] @relation("TaskExtractionResults") + + @@index([projectId], map: "idx_extraction_tasks_project_id") + @@index([userId], map: "idx_extraction_tasks_user_id") + @@index([status], map: "idx_extraction_tasks_status") + @@map("extraction_tasks") + @@schema("asl_schema") +} + +/// 单篇文献提取结果,Worker 只写自己的 Result 行,绝不碰 Task 表 +model AslExtractionResult { + id String @id @default(uuid()) + taskId String @map("task_id") + projectId String @map("project_id") + pkbDocumentId String @map("pkb_document_id") + snapshotStorageKey String @map("snapshot_storage_key") // API 层冻结的 PKB OSS 路径 + snapshotFilename String @map("snapshot_filename") // API 层冻结的原始文件名 + status String @default("pending") // pending | extracting | completed | error + extractedData Json? @map("extracted_data") // LLM 结构化提取 JSON + quoteVerification Json? @map("quote_verification") // fuzzyQuoteMatch 三级置信度结果 + manualOverrides Json? @map("manual_overrides") // HITL 人工修改记录(M2) + reviewStatus String @default("pending") @map("review_status") // pending | approved + reviewedAt DateTime? @map("reviewed_at") + errorMessage String? @map("error_message") + processedAt DateTime? 
@map("processed_at") + createdAt DateTime @default(now()) @map("created_at") + updatedAt DateTime @updatedAt @map("updated_at") + + task AslExtractionTask @relation("TaskExtractionResults", fields: [taskId], references: [id], onDelete: Cascade) + + @@index([taskId, status], map: "idx_extraction_results_task_status") // Aggregator groupBy 性能保障 + @@index([taskId], map: "idx_extraction_results_task_id") + @@index([projectId], map: "idx_extraction_results_project_id") + @@map("extraction_results") + @@schema("asl_schema") +} + model DCHealthCheck { id String @id @default(uuid()) userId String @map("user_id") diff --git a/backend/prisma/seed-extraction-templates.ts b/backend/prisma/seed-extraction-templates.ts new file mode 100644 index 00000000..d67f8ec5 --- /dev/null +++ b/backend/prisma/seed-extraction-templates.ts @@ -0,0 +1,182 @@ +import { PrismaClient } from '@prisma/client'; + +const prisma = new PrismaClient(); + +/** + * ASL 工具 3 系统内置提取模板 Seed + * 3 套模板:RCT / Cohort / QC + * 字段定义来自《ASL 工具 3 全文提取数据字典与规范》 + */ +const SYSTEM_TEMPLATES = [ + { + code: 'RCT', + name: '随机对照试验 (RCT)', + description: '适用于随机对照试验文献的结构化数据提取,包含基线特征、RoB 2.0 偏倚风险评估和多种结局指标类型', + baseFields: { + metadata: [ + { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份,如 Gandhi 2018' }, + { key: 'nct_number', label: '临床试验注册号', type: 'string', description: 'ClinicalTrials.gov 注册号' }, + { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 RCT, Phase III RCT' }, + { key: 'funding_source', label: '资金来源', type: 'string', description: '资助方与利益冲突声明' }, + ], + baseline: [ + { key: 'treatment_name', label: '实验组干预', type: 'string', description: '含剂量/频次' }, + { key: 'control_name', label: '对照组干预', type: 'string', description: '如 Placebo' }, + { key: 'n_treatment', label: '实验组样本量', type: 'integer', description: 'Table 1 中 N=xxx' }, + { key: 'n_control', label: '对照组样本量', type: 'integer', description: 'Table 1 中 N=xxx' }, + { key: 'age_treatment', label: '实验组年龄', type: 
'string', description: 'Mean±SD 或 Median(IQR)' }, + { key: 'age_control', label: '对照组年龄', type: 'string', description: 'Mean±SD 或 Median(IQR)' }, + { key: 'male_percent', label: '男性比例(%)', type: 'string', description: '整体或分组' }, + ], + rob: [ + { key: 'rob_randomization', label: '随机序列产生', type: 'string', description: 'Low/High/Unclear Risk' }, + { key: 'rob_allocation', label: '分配隐藏', type: 'string', description: 'Low/High/Unclear Risk' }, + { key: 'rob_blinding', label: '盲法实施', type: 'string', description: 'Low/High/Unclear Risk' }, + { key: 'rob_attrition', label: '失访与数据完整性', type: 'string', description: 'Low/High/Unclear Risk' }, + ], + outcomes_survival: [ + { key: 'endpoint_name', label: '终点名称', type: 'string', description: '如 OS, PFS, MACE' }, + { key: 'hr_value', label: '风险比 (HR)', type: 'number', description: 'Hazard Ratio' }, + { key: 'hr_ci_lower', label: '95%CI 下限', type: 'number', description: '' }, + { key: 'hr_ci_upper', label: '95%CI 上限', type: 'number', description: '' }, + { key: 'p_value', label: 'P 值', type: 'string', description: '如 <0.001 或 0.032' }, + ], + outcomes_dichotomous: [ + { key: 'event_treatment', label: '实验组事件数', type: 'integer', description: '发生事件的具体人数' }, + { key: 'total_treatment', label: '实验组分析总人数', type: 'integer', description: '可能与基线总人数不同' }, + { key: 'event_control', label: '对照组事件数', type: 'integer', description: '' }, + { key: 'total_control', label: '对照组分析总人数', type: 'integer', description: '' }, + ], + outcomes_continuous: [ + { key: 'mean_treatment', label: '实验组均值', type: 'number', description: '' }, + { key: 'sd_treatment', label: '实验组标准差', type: 'number', description: 'SD,若原文为 SE/CI 需换算' }, + { key: 'n_treatment_outcome', label: '实验组分析人数', type: 'integer', description: '' }, + { key: 'mean_control', label: '对照组均值', type: 'number', description: '' }, + { key: 'sd_control', label: '对照组标准差', type: 'number', description: '' }, + { key: 'n_control_outcome', label: '对照组分析人数', type: 'integer', description: '' }, + ], + }, + }, 
+ { + code: 'Cohort', + name: '队列研究 (Cohort)', + description: '适用于前瞻性/回顾性队列研究,基线特征与 RCT 类似但无随机化相关偏倚评估', + baseFields: { + metadata: [ + { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份' }, + { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 Prospective Cohort, Retrospective Cohort' }, + { key: 'funding_source', label: '资金来源', type: 'string', description: '' }, + { key: 'follow_up_duration', label: '随访时长', type: 'string', description: '如 Median 5.2 years' }, + ], + baseline: [ + { key: 'exposure_group', label: '暴露组', type: 'string', description: '暴露因素描述' }, + { key: 'control_group', label: '非暴露组/对照组', type: 'string', description: '' }, + { key: 'n_exposure', label: '暴露组样本量', type: 'integer', description: '' }, + { key: 'n_control', label: '对照组样本量', type: 'integer', description: '' }, + { key: 'age_exposure', label: '暴露组年龄', type: 'string', description: '' }, + { key: 'age_control', label: '对照组年龄', type: 'string', description: '' }, + { key: 'male_percent', label: '男性比例(%)', type: 'string', description: '' }, + ], + rob: [ + { key: 'rob_selection', label: '选择偏倚', type: 'string', description: 'NOS: 代表性、非暴露组选择、暴露确定' }, + { key: 'rob_comparability', label: '可比性', type: 'string', description: 'NOS: 混杂因素控制' }, + { key: 'rob_outcome', label: '结局评估', type: 'string', description: 'NOS: 结局评估、随访充分性' }, + ], + outcomes_survival: [ + { key: 'endpoint_name', label: '终点名称', type: 'string', description: '' }, + { key: 'hr_value', label: '风险比 (HR)', type: 'number', description: '调整后 HR' }, + { key: 'hr_ci_lower', label: '95%CI 下限', type: 'number', description: '' }, + { key: 'hr_ci_upper', label: '95%CI 上限', type: 'number', description: '' }, + { key: 'p_value', label: 'P 值', type: 'string', description: '' }, + ], + outcomes_dichotomous: [ + { key: 'event_treatment', label: '暴露组事件数', type: 'integer', description: '' }, + { key: 'total_treatment', label: '暴露组总人数', type: 'integer', description: '' }, + { key: 'event_control', label: 
'对照组事件数', type: 'integer', description: '' }, + { key: 'total_control', label: '对照组总人数', type: 'integer', description: '' }, + ], + outcomes_continuous: [ + { key: 'mean_treatment', label: '暴露组均值', type: 'number', description: '' }, + { key: 'sd_treatment', label: '暴露组标准差', type: 'number', description: '' }, + { key: 'n_treatment_outcome', label: '暴露组分析人数', type: 'integer', description: '' }, + { key: 'mean_control', label: '对照组均值', type: 'number', description: '' }, + { key: 'sd_control', label: '对照组标准差', type: 'number', description: '' }, + { key: 'n_control_outcome', label: '对照组分析人数', type: 'integer', description: '' }, + ], + }, + }, + { + code: 'QC', + name: '质量改进研究 (QI/QC)', + description: '适用于质量改进研究,关注干预前后的指标变化,偏倚评估采用 ROBINS-I 简化版', + baseFields: { + metadata: [ + { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份' }, + { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 Before-After, ITS' }, + { key: 'setting', label: '研究场景', type: 'string', description: '如 ICU, Emergency Department' }, + { key: 'funding_source', label: '资金来源', type: 'string', description: '' }, + ], + baseline: [ + { key: 'intervention_name', label: 'QI 干预措施', type: 'string', description: '质量改进措施描述' }, + { key: 'comparator', label: '对照/基线', type: 'string', description: '如 Pre-intervention period' }, + { key: 'n_intervention', label: '干预组样本量', type: 'integer', description: '' }, + { key: 'n_comparator', label: '对照组样本量', type: 'integer', description: '' }, + { key: 'duration_pre', label: '干预前观察期', type: 'string', description: '' }, + { key: 'duration_post', label: '干预后观察期', type: 'string', description: '' }, + ], + rob: [ + { key: 'rob_confounding', label: '混杂偏倚', type: 'string', description: 'ROBINS-I 简化' }, + { key: 'rob_measurement', label: '测量偏倚', type: 'string', description: '结局指标测量方法是否一致' }, + { key: 'rob_reporting', label: '报告偏倚', type: 'string', description: '是否选择性报告' }, + ], + outcomes_dichotomous: [ + { key: 'event_treatment', label: 
'干预后事件数', type: 'integer', description: '' }, + { key: 'total_treatment', label: '干预后总人数', type: 'integer', description: '' }, + { key: 'event_control', label: '干预前事件数', type: 'integer', description: '' }, + { key: 'total_control', label: '干预前总人数', type: 'integer', description: '' }, + ], + outcomes_continuous: [ + { key: 'mean_treatment', label: '干预后均值', type: 'number', description: '' }, + { key: 'sd_treatment', label: '干预后标准差', type: 'number', description: '' }, + { key: 'n_treatment_outcome', label: '干预后分析人数', type: 'integer', description: '' }, + { key: 'mean_control', label: '干预前均值', type: 'number', description: '' }, + { key: 'sd_control', label: '干预前标准差', type: 'number', description: '' }, + { key: 'n_control_outcome', label: '干预前分析人数', type: 'integer', description: '' }, + ], + }, + }, +]; + +async function main() { + console.log('🌱 ASL 工具 3:注入系统内置提取模板...\n'); + + for (const template of SYSTEM_TEMPLATES) { + const result = await prisma.aslExtractionTemplate.upsert({ + where: { code: template.code }, + update: { + name: template.name, + description: template.description, + baseFields: template.baseFields, + }, + create: { + code: template.code, + name: template.name, + description: template.description, + baseFields: template.baseFields, + isSystem: true, + }, + }); + console.log(` ✅ ${result.code}: ${result.name}`); + } + + const count = await prisma.aslExtractionTemplate.count(); + console.log(`\n🎉 完成!共 ${count} 套系统模板。`); +} + +main() + .then(() => prisma.$disconnect()) + .catch(async (e) => { + console.error('❌ Seed 失败:', e); + await prisma.$disconnect(); + process.exit(1); + }); diff --git a/backend/src/common/jobs/PgBossQueue.ts b/backend/src/common/jobs/PgBossQueue.ts index f6429c4b..0c8369b8 100644 --- a/backend/src/common/jobs/PgBossQueue.ts +++ b/backend/src/common/jobs/PgBossQueue.ts @@ -43,6 +43,14 @@ import { logger } from '../logging/index.js' export class PgBossQueue implements JobQueue { private boss: PgBoss private jobs: Map = new Map() // 
任务元数据缓存 + + /** + * 暴露 pg-boss 原生实例,供 Level 2 散装派发模式直接使用。 + * Level 1 单体任务继续使用 push/process; + * Level 2 批量任务(如 ASL 工具 3)通过此方法获取原生 API: + * boss.insert(jobs) / boss.work(name, { teamConcurrency }) / boss.schedule(name, cron) + */ + getNativeBoss(): PgBoss { return this.boss } private handlers: Map = new Map() private started: boolean = false @@ -58,7 +66,7 @@ export class PgBossQueue implements JobQueue { // 维护配置 supervise: true, // 启用监控 - maintenanceIntervalSeconds: 300, // 每5分钟运行维护任务 + maintenanceIntervalSeconds: 30, // 每30秒运行维护任务(保障 schedule cron 按时触发) }) // 🛡️ 全局错误监听:防止未捕获错误导致进程崩溃 diff --git a/backend/src/index.ts b/backend/src/index.ts index e3e09854..bbb39a0e 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -23,6 +23,7 @@ import { registerTestRoutes } from './test-platform-api.js'; import { registerScreeningWorkers } from './modules/asl/services/screeningWorker.js'; import { registerResearchWorker } from './modules/asl/workers/researchWorker.js'; import { registerExtractionWorkers } from './modules/dc/tool-b/workers/extractionWorker.js'; +import { registerExtractionWorkers as registerAslExtractionWorkers } from './modules/asl/extraction/workers/index.js'; import { registerParseExcelWorker } from './modules/dc/tool-c/workers/parseExcelWorker.js'; import { registerReviewWorker } from './modules/rvw/workers/reviewWorker.js'; import { jobQueue } from './common/jobs/index.js'; @@ -253,6 +254,10 @@ const start = async () => { registerParseExcelWorker(); logger.info('✅ DC Tool C parse excel worker registered'); + // 注册ASL工具3全文提取Workers(散装派发 + Aggregator) + await registerAslExtractionWorkers(); + logger.info('✅ ASL extraction workers registered (Tool 3)'); + // 注册RVW审稿Worker(包含启动时清理卡住任务) await registerReviewWorker(); logger.info('✅ RVW review worker registered'); diff --git a/backend/src/modules/asl/extraction/__tests__/m1-pipeline-test.ts b/backend/src/modules/asl/extraction/__tests__/m1-pipeline-test.ts new file mode 100644 index 00000000..37cdb38d 
--- /dev/null +++ b/backend/src/modules/asl/extraction/__tests__/m1-pipeline-test.ts @@ -0,0 +1,499 @@ +/** + * M1 骨架管线端到端验证测试 + * + * 运行方式(需先启动后端服务): + * cd backend && npx tsx src/modules/asl/extraction/__tests__/m1-pipeline-test.ts + * + * 验证阶段: + * Phase 1: DB 模型 + Seed 验证(直连数据库) + * Phase 2: PkbExportService ACL 防腐层验证 + * Phase 3: API 端点验证(需后端运行 + JWT) + * Phase 4: 散装派发 + Worker + Aggregator 全链路验证 + * Phase 5: 幂等 + 幽灵守卫 + 僵尸清理边界验证 + */ + +import { PrismaClient } from '@prisma/client'; +import jwt from 'jsonwebtoken'; +import { aggregatorHandler } from '../workers/ExtractionAggregator.js'; + +const prisma = new PrismaClient(); +const API_BASE = 'http://localhost:3001/api/v1/asl/extraction'; +const JWT_SECRET = process.env.JWT_SECRET || 'your-secret-key-change-in-production'; + +const sleep = (ms: number) => new Promise(r => setTimeout(r, ms)); + +let passed = 0; +let failed = 0; + +function ok(name: string) { + passed++; + console.log(` ✅ ${name}`); +} + +function fail(name: string, detail?: string) { + failed++; + console.log(` ❌ ${name}${detail ? ': ' + detail : ''}`); +} + +function assert(condition: boolean, name: string, detail?: string) { + condition ? 
ok(name) : fail(name, detail); +} + +function makeTestToken(userId: string, tenantId: string): string { + return jwt.sign( + { userId, phone: '13800000001', role: 'SUPER_ADMIN', tenantId }, + JWT_SECRET, + { expiresIn: '1h', issuer: 'aiclinical', subject: userId }, + ); +} + +async function fetchJSON(path: string, options: RequestInit = {}) { + const resp = await fetch(`${API_BASE}${path}`, { + ...options, + headers: { + 'Content-Type': 'application/json', + ...options.headers, + }, + }); + const data = await resp.json(); + return { status: resp.status, data }; +} + +async function fetchWithAuth(token: string, path: string, options: RequestInit = {}) { + return fetchJSON(path, { + ...options, + headers: { Authorization: `Bearer ${token}`, ...options.headers }, + }); +} + +// ═══════════════════════════════════════════════════════════ +// Phase 1: DB 模型 + Seed 验证 +// ═══════════════════════════════════════════════════════════ +async function phase1() { + console.log('\n📦 Phase 1: DB 模型 + Seed 验证'); + console.log('─'.repeat(50)); + + const templates = await prisma.aslExtractionTemplate.findMany({ orderBy: { code: 'asc' } }); + assert(templates.length === 3, '系统模板数量 = 3', `实际: ${templates.length}`); + + const codes = templates.map(t => t.code); + assert( + codes.includes('Cohort') && codes.includes('QC') && codes.includes('RCT'), + '模板代码包含 Cohort / QC / RCT', + `实际: ${codes.join(', ')}`, + ); + + for (const t of templates) { + const fields = t.baseFields as Record<string, unknown>; + assert(!!fields.metadata && !!fields.baseline && !!fields.rob, `${t.code} 包含 metadata/baseline/rob`); + } + + const rct = templates.find(t => t.code === 'RCT')!; + const rctFields = rct.baseFields as Record<string, unknown>; + assert( + !!rctFields.outcomes_survival && !!rctFields.outcomes_dichotomous && !!rctFields.outcomes_continuous, + 'RCT 包含三种 outcome 类型', + ); + + const taskColumns = await prisma.$queryRaw<Array<{ column_name: string }>>` + SELECT column_name FROM information_schema.columns + WHERE table_schema = 'asl_schema' AND table_name = 
'extraction_tasks' + ORDER BY ordinal_position + `; + const colNames = taskColumns.map((c: any) => c.column_name); + assert(!colNames.includes('success_count'), 'Task 表无 success_count 冗余字段'); + assert(!colNames.includes('failed_count'), 'Task 表无 failed_count 冗余字段'); + assert(colNames.includes('idempotency_key'), 'Task 表含 idempotency_key'); + + const indexes = await prisma.$queryRaw<Array<{ indexname: string }>>` + SELECT indexname FROM pg_indexes + WHERE schemaname = 'asl_schema' AND tablename = 'extraction_results' + `; + const idxNames = indexes.map((i: any) => i.indexname); + assert( + idxNames.includes('idx_extraction_results_task_status'), + 'Result 含 [taskId, status] 复合索引(Aggregator 性能保障)', + ); +} + +// ═══════════════════════════════════════════════════════════ +// Phase 2: PkbExportService ACL 防腐层 +// ═══════════════════════════════════════════════════════════ +async function phase2() { + console.log('\n🔌 Phase 2: PkbExportService ACL 防腐层'); + console.log('─'.repeat(50)); + + const anyKb = await prisma.knowledgeBase.findFirst({ + include: { _count: { select: { documents: true } } }, + }); + + if (!anyKb) { + console.log(' ⚠️ PKB 无知识库数据,跳过 Phase 2'); + return null; + } + ok(`找到知识库: ${anyKb.name} (${anyKb._count.documents} 篇)`); + + const doc = await prisma.document.findFirst({ + where: { kbId: anyKb.id }, + select: { id: true, storageKey: true, filename: true, extractedText: true }, + }); + + if (!doc) { + console.log(' ⚠️ 知识库内无文档,跳过文档级测试'); + return { kbId: anyKb.id, userId: anyKb.userId }; + } + + ok(`找到文档: ${doc.filename}`); + assert(!!doc.storageKey, '文档含 storageKey(OSS 路径)'); + + const hasText = !!doc.extractedText && doc.extractedText.trim().length > 0; + if (hasText) { + ok(`文档含 extractedText (${doc.extractedText!.length} 字符)`); + } else { + console.log(' ⚠️ 文档无 extractedText — Worker 将标记为 permanent error'); + } + + return { kbId: anyKb.id, userId: anyKb.userId, docId: doc.id, hasText }; +} + +// ═══════════════════════════════════════════════════════════ +// Phase 3: API 端点验证 
+// ═══════════════════════════════════════════════════════════ +async function phase3(pkbData: any) { + console.log('\n🌐 Phase 3: API 端点验证'); + console.log('─'.repeat(50)); + + const admin = await prisma.user.findFirst({ where: { role: 'SUPER_ADMIN' } }); + if (!admin) { + console.log(' ⚠️ 无超管用户,跳过 API 测试'); + return null; + } + + const token = makeTestToken(admin.id, admin.tenantId); + ok(`生成测试 JWT (userId: ${admin.id.slice(0, 8)}...)`); + + // 3.1 GET /templates + try { + const { status, data } = await fetchWithAuth(token, '/templates'); + assert(status === 200, 'GET /templates → 200'); + assert(data.success === true, 'GET /templates → success: true'); + assert(Array.isArray(data.data) && data.data.length === 3, 'GET /templates → 返回 3 套模板'); + } catch (e: any) { + fail('GET /templates', e.message + ' — 后端是否已启动?'); + return null; + } + + // 3.2 GET /templates/:id + const { data: templatesData } = await fetchWithAuth(token, '/templates'); + const rctTemplate = templatesData.data.find((t: any) => t.code === 'RCT'); + { + const { status, data } = await fetchWithAuth(token, `/templates/${rctTemplate.id}`); + assert(status === 200, 'GET /templates/:id → 200'); + assert(data.data?.code === 'RCT', 'GET /templates/:id → code = RCT'); + } + + // 3.3 POST /templates/clone + const projectId = `test-m1-${Date.now()}`; + let projectTemplateId: string | null = null; + { + const { status, data } = await fetchWithAuth(token, '/templates/clone', { + method: 'POST', + body: JSON.stringify({ projectId, baseTemplateId: rctTemplate.id }), + }); + assert(status === 200, 'POST /templates/clone → 200'); + assert(!!data.data?.id, 'POST /templates/clone → 返回 projectTemplateId'); + projectTemplateId = data.data?.id; + } + + // 3.4 幂等验证 + { + const { data } = await fetchWithAuth(token, '/templates/clone', { + method: 'POST', + body: JSON.stringify({ projectId, baseTemplateId: rctTemplate.id }), + }); + assert(data.data?.id === projectTemplateId, 'POST /templates/clone 幂等 → 返回相同 ID'); + } + 
+ // 3.5 GET /knowledge-bases + { + const { status, data } = await fetchWithAuth(token, '/knowledge-bases'); + assert(status === 200, 'GET /knowledge-bases → 200'); + assert(Array.isArray(data.data), 'GET /knowledge-bases → 返回数组'); + } + + if (!pkbData?.kbId || !pkbData?.docId) { + console.log(' ⚠️ 无可用 PKB 文档,跳过任务创建测试'); + return { token, projectId, projectTemplateId }; + } + + // 3.6 GET /knowledge-bases/:kbId/documents + { + const { status, data } = await fetchWithAuth(token, `/knowledge-bases/${pkbData.kbId}/documents`); + assert(status === 200, 'GET /knowledge-bases/:kbId/documents → 200'); + assert(Array.isArray(data.data), 'GET /documents → 返回数组'); + } + + return { token, projectId, projectTemplateId, pkbData }; +} + +// ═══════════════════════════════════════════════════════════ +// Phase 4: 散装派发 + Worker + Aggregator 全链路 +// ═══════════════════════════════════════════════════════════ +async function phase4(ctx: any) { + console.log('\n⚡ Phase 4: 散装派发 + Worker + Aggregator 全链路'); + console.log('─'.repeat(50)); + + if (!ctx?.token || !ctx?.projectTemplateId || !ctx?.pkbData?.docId) { + console.log(' ⚠️ 前置条件不足,跳过全链路测试'); + return null; + } + + const { token, projectId, projectTemplateId, pkbData } = ctx; + const idempotencyKey = `m1-test-${Date.now()}`; + + // 4.1 POST /tasks + let taskId: string; + { + const { status, data } = await fetchWithAuth(token, '/tasks', { + method: 'POST', + body: JSON.stringify({ + projectId, + projectTemplateId, + pkbKnowledgeBaseId: pkbData.kbId, + documentIds: [pkbData.docId], + idempotencyKey, + }), + }); + assert(status === 200, 'POST /tasks → 200'); + taskId = data.taskId || data.data?.taskId; + assert(!!taskId, `POST /tasks → 返回 taskId: ${taskId?.slice(0, 8)}...`); + } + + // 4.2 验证 DB + { + const task = await prisma.aslExtractionTask.findUnique({ where: { id: taskId } }); + assert(!!task, 'DB: Task 记录已创建'); + assert(task!.status === 'processing', `DB: Task.status = processing (实际: ${task!.status})`); + 
assert(task!.totalCount === 1, 'DB: Task.totalCount = 1'); + + const results = await prisma.aslExtractionResult.findMany({ where: { taskId } }); + assert(results.length === 1, 'DB: 创建了 1 条 Result 记录'); + assert( + results[0].status === 'pending' || results[0].status === 'extracting', + `DB: Result.status 初始为 pending/extracting`, + ); + assert(!!results[0].snapshotStorageKey, 'DB: Result.snapshotStorageKey 快照已冻结'); + assert(!!results[0].snapshotFilename, 'DB: Result.snapshotFilename 快照已冻结'); + } + + // 4.3 轮询等待 Worker 完成(最多 180 秒,LLM 调用可能慢) + console.log(' ⏳ 等待 Worker 处理...'); + let workerDone = false; + for (let i = 0; i < 60; i++) { + await sleep(3000); + const r = await prisma.aslExtractionResult.findFirst({ where: { taskId } }); + process.stdout.write(`\r ⏳ [${(i + 1) * 3}s] Result.status=${r?.status} `); + if (r?.status === 'completed' || r?.status === 'error') { + workerDone = true; + break; + } + } + console.log(); + assert(workerDone, 'Worker 在 180s 内完成处理'); + + // 4.4 直接调用 Aggregator 收口(不等 cron,直接验证逻辑) + console.log(' 🔄 手动触发 Aggregator 收口...'); + await aggregatorHandler(); + + // 4.5 验证 Task 被 Aggregator 收口 + { + const task = await prisma.aslExtractionTask.findUnique({ where: { id: taskId } }); + assert( + task?.status === 'completed' || task?.status === 'failed', + `Aggregator 收口 → Task.status = ${task?.status}`, + ); + assert(!!task?.completedAt, 'Aggregator 设置了 Task.completedAt'); + } + + // 4.6 API 同步确认 + { + const { data } = await fetchWithAuth(token, `/tasks/${taskId}`); + const s = data.data; + assert( + s?.status === 'completed' || s?.status === 'failed', + `GET /tasks/:id → status = ${s?.status}`, + ); + } + + // 4.7 验证 Result 数据 + { + const results = await prisma.aslExtractionResult.findMany({ where: { taskId } }); + const r = results[0]; + if (r.status === 'completed') { + ok('Result.status = completed'); + assert(!!r.extractedData, 'Result.extractedData 已填充'); + assert(!!r.processedAt, 'Result.processedAt 已记录'); + + const data = r.extractedData 
as any; + assert(!!data.metadata, 'extractedData 包含 metadata 模块'); + console.log(` 📊 提取 Study ID: ${data.metadata?.study_id || '(null)'}`); + } else if (r.status === 'error') { + ok('Result.status = error(文档或 LLM 异常,预期行为)'); + console.log(` 📝 errorMessage: ${r.errorMessage?.slice(0, 100)}`); + } + } + + // 4.8 GET /tasks/:id/results API 验证 + { + const { status, data } = await fetchWithAuth(token, `/tasks/${taskId}/results`); + assert(status === 200, 'GET /tasks/:id/results → 200'); + assert(Array.isArray(data.data), 'GET /tasks/:id/results → 返回数组'); + assert(data.data?.length === 1, 'GET /tasks/:id/results → 1 条记录'); + } + + return { taskId, idempotencyKey }; +} + +// ═══════════════════════════════════════════════════════════ +// Phase 5: 幂等 + 僵尸清理边界 +// ═══════════════════════════════════════════════════════════ +async function phase5(ctx: any, phase4Result: any) { + console.log('\n🛡️ Phase 5: 幂等 + 边界条件验证'); + console.log('─'.repeat(50)); + + if (!ctx?.token) { + console.log(' ⚠️ 无 token,跳过'); + return; + } + const { token, projectId, projectTemplateId, pkbData } = ctx; + + // 5.1 幂等:重复 idempotencyKey + if (phase4Result?.idempotencyKey && pkbData?.docId) { + const { status, data } = await fetchWithAuth(token, '/tasks', { + method: 'POST', + body: JSON.stringify({ + projectId, + projectTemplateId, + pkbKnowledgeBaseId: pkbData.kbId, + documentIds: [pkbData.docId], + idempotencyKey: phase4Result.idempotencyKey, + }), + }); + assert(status === 200, 'POST /tasks 幂等 → 200'); + const returnedTaskId = data.taskId || data.data?.taskId; + assert( + returnedTaskId === phase4Result.taskId, + `幂等返回相同 taskId: ${returnedTaskId?.slice(0, 8)}...`, + ); + } + + // 5.2 空文献边界 + { + const { status, data } = await fetchWithAuth(token, '/tasks', { + method: 'POST', + body: JSON.stringify({ + projectId: `empty-test-${Date.now()}`, + projectTemplateId, + pkbKnowledgeBaseId: 'fake-kb', + documentIds: [], + idempotencyKey: `empty-${Date.now()}`, + }), + }); + assert( + status === 400 || 
data.success === false, + `空文献 documentIds=[] → 拒绝 (status=${status})`, + ); + } + + // 5.3 Aggregator 僵尸清理(手动模拟 + 直接调用) + { + console.log(' 🧟 模拟僵尸 Result(extracting + 31分钟前)...'); + const zombieTask = await prisma.aslExtractionTask.create({ + data: { + projectId: `zombie-test-${Date.now()}`, + userId: 'test-user', + projectTemplateId: projectTemplateId || 'fake-pt', + pkbKnowledgeBaseId: 'fake-kb', + totalCount: 1, + status: 'processing', + }, + }); + + await prisma.aslExtractionResult.create({ + data: { + taskId: zombieTask.id, + projectId: zombieTask.projectId, + pkbDocumentId: 'fake-doc', + snapshotStorageKey: 'test/zombie.pdf', + snapshotFilename: 'zombie.pdf', + status: 'extracting', + }, + }); + + // 回退 updatedAt 到 31 分钟前 + await prisma.$executeRaw` + UPDATE asl_schema.extraction_results + SET updated_at = NOW() - INTERVAL '31 minutes' + WHERE task_id = ${zombieTask.id} + `; + ok('已创建僵尸 Result(extracting + 31min ago)'); + + // 直接调用 Aggregator + console.log(' 🔄 手动触发 Aggregator 处理僵尸...'); + await aggregatorHandler(); + + const r = await prisma.aslExtractionResult.findFirst({ where: { taskId: zombieTask.id } }); + assert(r?.status === 'error', `Aggregator 僵尸清理 → Result.status = ${r?.status}`); + assert( + r?.errorMessage?.includes('Timeout') || r?.errorMessage?.includes('zombie'), + `僵尸 errorMessage 包含超时信息`, + ); + + const t = await prisma.aslExtractionTask.findUnique({ where: { id: zombieTask.id } }); + assert( + t?.status === 'failed', + `僵尸 Task 被 Aggregator 收口 → ${t?.status}`, + ); + + // 清理 + await prisma.aslExtractionResult.deleteMany({ where: { taskId: zombieTask.id } }); + await prisma.aslExtractionTask.delete({ where: { id: zombieTask.id } }); + ok('已清理僵尸测试数据'); + } +} + +// ═══════════════════════════════════════════════════════════ +// Main +// ═══════════════════════════════════════════════════════════ +async function main() { + console.log('╔════════════════════════════════════════════════════╗'); + console.log('║ 🧪 M1 骨架管线端到端验证测试 ║'); + 
console.log('║ 工具 3:全文智能提取工作台 V2.0 ║'); + console.log('╚════════════════════════════════════════════════════╝'); + console.log(`⏰ ${new Date().toLocaleString('zh-CN')}`); + console.log(`📍 API: ${API_BASE}`); + + try { + await phase1(); + const pkbData = await phase2(); + const phase3Ctx = await phase3(pkbData); + const phase4Result = await phase4(phase3Ctx); + await phase5(phase3Ctx, phase4Result); + } catch (e: any) { + console.error('\n💥 测试意外中断:', e.message); + console.error(e.stack); + } finally { + await prisma.$disconnect(); + } + + console.log('\n' + '═'.repeat(50)); + console.log(`✅ 通过: ${passed} ❌ 失败: ${failed} 总计: ${passed + failed}`); + if (failed > 0) { + console.log('\n⚠️ 有测试失败,请检查上方日志定位问题。'); + process.exit(1); + } else { + console.log('\n🎉 M1 管线全部验证通过!可以安全合入主分支。'); + } +} + +main(); diff --git a/backend/src/modules/asl/extraction/__tests__/m2-hitl-test.ts b/backend/src/modules/asl/extraction/__tests__/m2-hitl-test.ts new file mode 100644 index 00000000..3b443eff --- /dev/null +++ b/backend/src/modules/asl/extraction/__tests__/m2-hitl-test.ts @@ -0,0 +1,391 @@ +/** + * M2 HITL 工作台集成测试 + * + * 运行方式(需先启动后端服务): + * cd backend && npx tsx src/modules/asl/extraction/__tests__/m2-hitl-test.ts + * + * 验证阶段: + * Phase 1: M2 新增 API 端点验证(结果详情 / 审核 / SSE / 导出) + * Phase 2: DynamicPromptBuilder 单元测试 + * Phase 3: ExtractionValidator fuzzyQuoteMatch 单元测试 + * Phase 4: ExtractionEventBus 单元测试 + * Phase 5: Excel 导出端到端验证 + * Phase 6: 断点恢复(URL → 正确步骤) + */ + +import { PrismaClient } from '@prisma/client'; +import jwt from 'jsonwebtoken'; +import { buildExtractionPrompt } from '../services/DynamicPromptBuilder.js'; +import { extractionValidator } from '../services/ExtractionValidator.js'; +import { extractionEventBus } from '../services/ExtractionEventBus.js'; + +const prisma = new PrismaClient(); +const API_BASE = 'http://localhost:3001/api/v1/asl/extraction'; +const JWT_SECRET = process.env.JWT_SECRET || 'your-secret-key-change-in-production'; + +const sleep = (ms: 
number) => new Promise((r) => setTimeout(r, ms)); + +let passed = 0; +let failed = 0; + +function ok(name: string) { + passed++; + console.log(` ✅ ${name}`); +} + +function fail(name: string, reason: string) { + failed++; + console.log(` ❌ ${name}: ${reason}`); +} + +function assert(condition: boolean, name: string, reason = 'Assertion failed') { + condition ? ok(name) : fail(name, reason); +} + +let _cachedToken: string | null = null; + +function makeTestToken(userId: string, tenantId: string): string { + return jwt.sign( + { userId, phone: '13800000001', role: 'SUPER_ADMIN', tenantId }, + JWT_SECRET, + { expiresIn: '1h', issuer: 'aiclinical', subject: userId }, + ); +} + +async function getAuthToken(): Promise { + if (_cachedToken) return _cachedToken; + const admin = await prisma.user.findFirst({ where: { role: 'SUPER_ADMIN' } }); + if (!admin) throw new Error('无 SUPER_ADMIN 用户,无法执行 API 测试'); + _cachedToken = makeTestToken(admin.id, admin.tenantId); + return _cachedToken; +} + +async function fetchWithAuth(path: string, options: RequestInit = {}) { + const token = await getAuthToken(); + return fetch(`${API_BASE}${path}`, { + ...options, + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}`, + ...options.headers, + }, + }); +} + +// ══════════════════════════════════════════════════════ +// Phase 1: M2 新增 API 端点验证 +// ══════════════════════════════════════════════════════ + +async function phase1() { + console.log('\n📡 Phase 1: M2 API 端点验证'); + + // 查找一个已有的 completed result + const result = await prisma.aslExtractionResult.findFirst({ + where: { status: 'completed' }, + orderBy: { createdAt: 'desc' }, + }); + + if (!result) { + console.log(' ⚠️ 无已完成的提取结果,跳过 API 端点测试(请先运行 M1 pipeline)'); + return { resultId: null, taskId: null }; + } + + // 1.1 GET /results/:resultId — 结果详情 + const detailRes = await fetchWithAuth(`/results/${result.id}`); + const detailJson = await detailRes.json(); + assert(detailRes.ok && detailJson.success, 
'获取单条结果详情', `status=${detailRes.status}`); + assert(detailJson.data?.id === result.id, '结果 ID 正确'); + + // 1.2 PUT /results/:resultId/review — 审核 + const reviewRes = await fetchWithAuth(`/results/${result.id}/review`, { + method: 'PUT', + body: JSON.stringify({ reviewStatus: 'approved' }), + }); + const reviewJson = await reviewRes.json(); + assert(reviewRes.ok && reviewJson.success, '审核接口返回成功'); + + // 验证审核状态已更新 + const updatedResult = await prisma.aslExtractionResult.findUnique({ + where: { id: result.id }, + }); + assert(updatedResult?.reviewStatus === 'approved', 'DB reviewStatus 已更新为 approved'); + assert(updatedResult?.reviewedAt !== null, 'DB reviewedAt 已设置'); + + // 1.3 GET /tasks/:taskId/stream — SSE 端点(快速验证连接) + const sseToken = await getAuthToken(); + const sseRes = await fetch(`${API_BASE}/tasks/${result.taskId}/stream?token=${sseToken}`, { + headers: { Authorization: `Bearer ${sseToken}` }, + }); + assert( + sseRes.headers.get('content-type')?.includes('text/event-stream') || sseRes.ok, + 'SSE 端点返回 event-stream', + ); + // 不需要等待完整 SSE 流,关闭连接 + try { + // @ts-ignore + sseRes.body?.cancel?.(); + } catch { /* ok */ } + + // 1.4 GET /tasks/:taskId/export — Excel 导出 + const exportRes = await fetchWithAuth(`/tasks/${result.taskId}/export`); + if (exportRes.ok) { + const blob = await exportRes.blob(); + assert(blob.size > 0, 'Excel 导出成功且非空', `size=${blob.size}`); + } else { + // 可能没有 approved 结果 + const errText = await exportRes.text(); + console.log(` ⚠️ 导出可能无 approved 结果: ${errText}`); + ok('Excel 导出端点可达(无 approved 数据时预期 400)'); + } + + return { resultId: result.id, taskId: result.taskId }; +} + +// ══════════════════════════════════════════════════════ +// Phase 2: DynamicPromptBuilder 单元测试 +// ══════════════════════════════════════════════════════ + +function phase2() { + console.log('\n🧩 Phase 2: DynamicPromptBuilder 单元测试'); + + const schema = { + baseTemplateCode: 'RCT_ONCO', + outcomeType: 'survival', + schema: { + metadata: ['study_id', 'authors', 
'year'], + baseline: ['total_n', 'median_age'], + }, + }; + + // 2.1 纯文本模式(无 MinerU 表格) + const result1 = buildExtractionPrompt('This is the full text of the paper.', [], schema); + assert(result1.systemPrompt.includes('clinical research'), 'System prompt 包含角色定义'); + assert(result1.userPrompt.includes(''), 'User prompt 包含 FULL_TEXT 标签'); + assert(!result1.userPrompt.includes(''), '纯文本模式不含 HIGH_FIDELITY_TABLES'); + + // 2.2 MinerU + Markdown 混合模式 + const tables = ['
OS median: 12.3 months
']; + const result2 = buildExtractionPrompt('Full text here.', tables, schema); + assert(result2.userPrompt.includes(''), '混合模式包含 HIGH_FIDELITY_TABLES'); + assert(result2.userPrompt.includes(''), '混合模式包含 FULL_TEXT'); + assert(result2.systemPrompt.includes('AUTHORITATIVE'), 'System prompt 声明表格优先级'); + + // 2.3 Schema 正确嵌入 + assert(result1.userPrompt.includes('RCT_ONCO'), 'Schema study type 正确嵌入'); + assert(result1.userPrompt.includes('"study_id"'), 'Schema 字段正确嵌入'); + + // 2.4 Quote 指令 + assert(result1.userPrompt.includes('quote'), 'Prompt 包含 quote 指令'); +} + +// ══════════════════════════════════════════════════════ +// Phase 3: ExtractionValidator fuzzyQuoteMatch 单元测试 +// ══════════════════════════════════════════════════════ + +function phase3() { + console.log('\n🔍 Phase 3: fuzzyQuoteMatch 单元测试'); + + const sourceText = ` + The median overall survival was 12.3 months (95% CI, 10.1-15.7) in the pembrolizumab group + versus 8.9 months in the placebo group (HR 0.69; 95% CI, 0.56-0.85; P < 0.001). + A total of 305 patients were enrolled across 50 centers. 
+ `; + const normalizedSource = sourceText.toLowerCase().replace(/[\s\u00A0]+/g, ' ').replace(/[^\w\s\u4e00-\u9fff]/g, '').trim(); + + // 3.1 精确匹配 → high + const r1 = extractionValidator.fuzzyQuoteMatch( + sourceText, + normalizedSource, + 'median overall survival was 12.3 months', + ); + assert(r1.confidence === 'high', '精确子串匹配 → high', `got ${r1.confidence}`); + assert(r1.matchScore >= 0.95, '精确匹配 score ≥ 0.95', `got ${r1.matchScore}`); + + // 3.2 空白/标点差异 → high (normalized) + const r2 = extractionValidator.fuzzyQuoteMatch( + sourceText, + normalizedSource, + 'median overall survival was 12.3 months (95% CI, 10.1-15.7)', + ); + assert(r2.confidence === 'high', '标点差异匹配 → high', `got ${r2.confidence}`); + + // 3.3 关键词覆盖 ≥ 80% → medium + const r3 = extractionValidator.fuzzyQuoteMatch( + sourceText, + normalizedSource, + 'overall survival 12.3 months pembrolizumab group versus 8.9 months placebo group', + ); + assert(r3.confidence === 'high' || r3.confidence === 'medium', '高覆盖率关键词匹配 → high/medium', `got ${r3.confidence}`); + + // 3.4 完全不匹配 → low + const r4 = extractionValidator.fuzzyQuoteMatch( + sourceText, + normalizedSource, + 'This quote is completely fabricated by the LLM and has no match whatsoever', + ); + assert(r4.confidence === 'low', '不匹配 → low', `got ${r4.confidence}`); + assert(r4.matchScore < 0.5, '不匹配 score < 0.5', `got ${r4.matchScore}`); + + // 3.5 verifyAllQuotes 集成 + const extractedData = { + metadata: { + study_id: 'Gandhi 2018', + study_id_quote: 'Gandhi 2018', + total_n: 305, + total_n_quote: '305 patients were enrolled across 50 centers', + }, + outcomes: { + os_median: 12.3, + os_median_quote: 'The median overall survival was 12.3 months', + fake_field: 'fake', + fake_field_quote: 'completely fabricated hallucination not in source text at all', + }, + }; + + const scope = extractionValidator.buildQuoteSearchScope(sourceText, []); + const verification = extractionValidator.verifyAllQuotes(extractedData, scope); + + 
assert(verification.metadata?.total_n?.confidence === 'high', 'verifyAllQuotes: total_n → high', `got ${verification.metadata?.total_n?.confidence}`); + assert(verification.outcomes?.os_median?.confidence === 'high', 'verifyAllQuotes: os_median → high', `got ${verification.outcomes?.os_median?.confidence}`); + assert(verification.outcomes?.fake_field?.confidence === 'low', 'verifyAllQuotes: fake_field → low', `got ${verification.outcomes?.fake_field?.confidence}`); + + // 3.6 buildQuoteSearchScope 含 HTML 表格 + const tableHtml = '
PFS median 6.9 months
'; + const scopeWithTable = extractionValidator.buildQuoteSearchScope('Full text.', [tableHtml]); + assert(scopeWithTable.includes('PFS median 6.9 months'), 'searchScope 包含 HTML 表格纯文本'); + assert(!scopeWithTable.includes(''), 'searchScope 不含 HTML 标签'); +} + +// ══════════════════════════════════════════════════════ +// Phase 4: ExtractionEventBus 单元测试 +// ══════════════════════════════════════════════════════ + +async function phase4() { + console.log('\n📢 Phase 4: ExtractionEventBus 单元测试'); + + const testTaskId = 'test-eventbus-' + Date.now(); + const received: any[] = []; + + // 4.1 订阅 + 发送 + const unsub = extractionEventBus.subscribe(testTaskId, (entry) => { + received.push(entry); + }); + + extractionEventBus.emit(testTaskId, { source: 'MinerU', message: 'Processing page 1', level: 'info' }); + extractionEventBus.emit(testTaskId, { source: 'DeepSeek', message: 'Extracting fields', level: 'info' }); + extractionEventBus.emit(testTaskId, { source: 'System', message: 'Error occurred', level: 'error' }); + + await sleep(50); + + assert(received.length === 3, 'EventBus 收到 3 条消息', `got ${received.length}`); + assert(received[0].source === 'MinerU', 'EventBus 消息 source 正确'); + assert(received[0].timestamp !== undefined, 'EventBus 自动添加 timestamp'); + + // 4.2 getRecentLogs + const recent = extractionEventBus.getRecentLogs(testTaskId); + assert(recent.length === 3, 'getRecentLogs 返回 3 条', `got ${recent.length}`); + + // 4.3 取消订阅 + unsub(); + extractionEventBus.emit(testTaskId, { source: 'System', message: 'After unsub', level: 'info' }); + await sleep(50); + assert(received.length === 3, '取消订阅后不再接收', `got ${received.length}`); + + // 4.4 cleanup + extractionEventBus.cleanup(testTaskId); + const afterCleanup = extractionEventBus.getRecentLogs(testTaskId); + assert(afterCleanup.length === 0, 'cleanup 后日志清空'); +} + +// ══════════════════════════════════════════════════════ +// Phase 5: Excel 导出端到端验证 +// ══════════════════════════════════════════════════════ + +async 
function phase5(ctx: { taskId: string | null }) { + console.log('\n📊 Phase 5: Excel 导出端到端验证'); + + if (!ctx.taskId) { + console.log(' ⚠️ 无可用 taskId,跳过导出测试'); + return; + } + + // 确保至少有一个 approved result + const approvedCount = await prisma.aslExtractionResult.count({ + where: { taskId: ctx.taskId, reviewStatus: 'approved' }, + }); + + if (approvedCount === 0) { + console.log(' ⚠️ 无 approved 结果,跳过导出测试'); + return; + } + + const exportRes = await fetchWithAuth(`/tasks/${ctx.taskId}/export`); + assert(exportRes.ok, 'Excel 导出 HTTP 200'); + + const contentType = exportRes.headers.get('content-type') || ''; + assert( + contentType.includes('spreadsheet') || contentType.includes('octet-stream'), + 'Content-Type 为 Excel 格式', + `got ${contentType}`, + ); + + const disposition = exportRes.headers.get('content-disposition') || ''; + assert(disposition.includes('.xlsx'), 'Content-Disposition 包含 .xlsx', `got ${disposition}`); + + const blob = await exportRes.blob(); + assert(blob.size > 100, `Excel 文件大小合理 (${blob.size} bytes)`); +} + +// ══════════════════════════════════════════════════════ +// Phase 6: 断点恢复路由验证 +// ══════════════════════════════════════════════════════ + +function phase6() { + console.log('\n🔄 Phase 6: 断点恢复路由设计验证'); + + // 验证路由结构设计正确性(不实际测试前端路由,只验证约定) + const routes = [ + '/literature/extraction/setup', + '/literature/extraction/progress/some-task-id', + '/literature/extraction/workbench/some-task-id', + ]; + + for (const route of routes) { + assert(route.startsWith('/literature/extraction/'), `路由前缀正确: ${route}`); + } + + assert(routes[1].includes('/progress/'), 'Progress 路由包含 taskId'); + assert(routes[2].includes('/workbench/'), 'Workbench 路由包含 taskId'); + ok('断点恢复路由设计正确(刷新后 URL 可定位到正确步骤 + taskId)'); +} + +// ══════════════════════════════════════════════════════ +// Main +// ══════════════════════════════════════════════════════ + +async function main() { + console.log('═══════════════════════════════════════════'); + console.log(' M2 HITL 工作台集成测试'); + 
console.log('═══════════════════════════════════════════'); + + try { + const ctx = await phase1(); + phase2(); + phase3(); + await phase4(); + await phase5(ctx); + phase6(); + } catch (error: any) { + console.error('\n💥 未捕获异常:', error.message); + failed++; + } finally { + await prisma.$disconnect(); + } + + console.log('\n═══════════════════════════════════════════'); + console.log(` 结果: ✅ ${passed} 通过, ❌ ${failed} 失败`); + console.log('═══════════════════════════════════════════'); + process.exit(failed > 0 ? 1 : 0); +} + +main(); diff --git a/backend/src/modules/asl/extraction/controllers/ExtractionController.ts b/backend/src/modules/asl/extraction/controllers/ExtractionController.ts new file mode 100644 index 00000000..f377f363 --- /dev/null +++ b/backend/src/modules/asl/extraction/controllers/ExtractionController.ts @@ -0,0 +1,244 @@ +import { FastifyRequest, FastifyReply } from 'fastify'; +import { prisma } from '../../../../config/database.js'; +import { templateService } from '../services/TemplateService.js'; +import { extractionService } from '../services/ExtractionService.js'; +import { pkbBridgeService } from '../services/PkbBridgeService.js'; +import { extractionEventBus } from '../services/ExtractionEventBus.js'; +import { extractionExcelExporter } from '../services/ExtractionExcelExporter.js'; +import { logger } from '../../../../common/logging/index.js'; + +function getUserId(request: FastifyRequest): string { + const userId = (request as any).user?.userId; + if (!userId) throw new Error('User not authenticated'); + return userId; +} + +/** + * 工具 3 全文提取 API 控制器 + */ + +// ═══════════════════════════════════════════ +// 模板 API +// ═══════════════════════════════════════════ + +export async function listTemplates(request: FastifyRequest, reply: FastifyReply) { + const templates = await templateService.listSystemTemplates(); + return reply.send({ success: true, data: templates }); +} + +export async function getTemplate( + request: FastifyRequest<{ 
Params: { templateId: string } }>, + reply: FastifyReply, +) { + const template = await templateService.getSystemTemplate(request.params.templateId); + return reply.send({ success: true, data: template }); +} + +export async function cloneTemplate( + request: FastifyRequest<{ Body: { projectId: string; baseTemplateId: string } }>, + reply: FastifyReply, +) { + const userId = getUserId(request); + const { projectId, baseTemplateId } = request.body; + const projectTemplate = await templateService.cloneToProject(projectId, baseTemplateId, userId); + return reply.send({ success: true, data: projectTemplate }); +} + +// ═══════════════════════════════════════════ +// 提取任务 API +// ═══════════════════════════════════════════ + +export async function createTask( + request: FastifyRequest<{ + Body: { + projectId: string; + projectTemplateId: string; + pkbKnowledgeBaseId: string; + documentIds: string[]; + idempotencyKey?: string; + }; + }>, + reply: FastifyReply, +) { + const userId = getUserId(request); + const { projectId, projectTemplateId, pkbKnowledgeBaseId, documentIds, idempotencyKey } = request.body; + + const result = await extractionService.createTask({ + projectId, + userId, + projectTemplateId, + pkbKnowledgeBaseId, + documentIds, + idempotencyKey, + pkbBridge: pkbBridgeService, + }); + + return reply.send({ success: true, ...result }); +} + +export async function getTaskStatus( + request: FastifyRequest<{ Params: { taskId: string } }>, + reply: FastifyReply, +) { + const status = await extractionService.getTaskStatus(request.params.taskId); + return reply.send({ success: true, data: status }); +} + +export async function getTaskResults( + request: FastifyRequest<{ Params: { taskId: string } }>, + reply: FastifyReply, +) { + const results = await extractionService.getResults(request.params.taskId); + return reply.send({ success: true, data: results }); +} + +// ═══════════════════════════════════════════ +// PKB 数据代理 API(前端通过 ASL 访问,不直接调 PKB) +// 
═══════════════════════════════════════════ + +export async function listKnowledgeBases( + request: FastifyRequest, + reply: FastifyReply, +) { + const userId = getUserId(request); + const kbs = await pkbBridgeService.listKnowledgeBases(userId); + return reply.send({ success: true, data: kbs }); +} + +export async function listDocuments( + request: FastifyRequest<{ Params: { kbId: string } }>, + reply: FastifyReply, +) { + const docs = await pkbBridgeService.listPdfDocuments(request.params.kbId); + return reply.send({ success: true, data: docs }); +} + +// ═══════════════════════════════════════════ +// 单条提取结果详情 + 审核 API(M2 新增) +// ═══════════════════════════════════════════ + +export async function getResultDetail( + request: FastifyRequest<{ Params: { resultId: string } }>, + reply: FastifyReply, +) { + const result = await prisma.aslExtractionResult.findUnique({ + where: { id: request.params.resultId }, + include: { + task: { + select: { + projectTemplate: { + include: { baseTemplate: true }, + }, + }, + }, + }, + }); + if (!result) { + return reply.status(404).send({ success: false, error: 'Result not found' }); + } + + const baseFields = result.task?.projectTemplate?.baseTemplate?.baseFields as Record | undefined; + const outcomeType = result.task?.projectTemplate?.outcomeType || 'survival'; + + // Build schema (filtered by outcomeType, same logic as TemplateService.assembleFullSchema) + let schema: Record | undefined; + if (baseFields) { + schema = {}; + for (const [mod, fields] of Object.entries(baseFields)) { + if (mod.startsWith('outcomes_') && mod !== `outcomes_${outcomeType}`) continue; + schema[mod] = fields; + } + } + + return reply.send({ + success: true, + data: { + id: result.id, + pkbDocumentId: result.pkbDocumentId, + snapshotFilename: result.snapshotFilename, + snapshotStorageKey: result.snapshotStorageKey, + status: result.status, + reviewStatus: result.reviewStatus, + extractedData: result.extractedData, + quoteVerification: 
result.quoteVerification,
+      errorMessage: result.errorMessage,
+      processedAt: result.processedAt,
+      createdAt: result.createdAt,
+      schema,
+      outcomeType,
+    },
+  });
+}
+
+// Approve or reject a single extraction result.
+// NOTE(review): reviewStatus is not validated against 'approved' | 'rejected' at runtime here —
+// the TS body type is compile-time only; confirm a Fastify schema validates it upstream.
+// NOTE(review): prisma.update presumably throws when resultId does not exist — confirm a 404 mapping exists.
+export async function reviewResult(
+  request: FastifyRequest<{
+    Params: { resultId: string };
+    Body: { reviewStatus: 'approved' | 'rejected' };
+  }>,
+  reply: FastifyReply,
+) {
+  const { reviewStatus } = request.body;
+
+  const updated = await prisma.aslExtractionResult.update({
+    where: { id: request.params.resultId },
+    data: {
+      reviewStatus,
+      reviewedAt: new Date(),
+    },
+  });
+  return reply.send({ success: true, data: updated });
+}
+
+// ═══════════════════════════════════════════
+// SSE log stream endpoint (new in M2)
+// ═══════════════════════════════════════════
+
+// Streams task log events as Server-Sent Events; writes directly to the raw response.
+// NOTE(review): this route is registered WITHOUT the auth hook (see routes/index.ts, EventSource
+// cannot set headers), and the ?token query param is accepted but never validated in this handler —
+// confirm this endpoint is intentionally safe to expose unauthenticated.
+export async function streamTaskLogs(
+  request: FastifyRequest<{ Params: { taskId: string }; Querystring: { token?: string } }>,
+  reply: FastifyReply,
+) {
+  const { taskId } = request.params;
+
+  reply.raw.writeHead(200, {
+    'Content-Type': 'text/event-stream',
+    'Cache-Control': 'no-cache',
+    Connection: 'keep-alive',
+    'X-Accel-Buffering': 'no', // disables nginx proxy buffering so events flush immediately
+  });
+
+  // First frame: replay the buffered history so a reconnecting client catches up
+  const recentLogs = extractionEventBus.getRecentLogs(taskId);
+  reply.raw.write(`event: sync\ndata: ${JSON.stringify({ logs: recentLogs })}\n\n`);
+
+  // Live subscription: forward each new log entry as its own SSE event
+  const unsubscribe = extractionEventBus.subscribe(taskId, (entry) => {
+    reply.raw.write(`event: log\ndata: ${JSON.stringify(entry)}\n\n`);
+  });
+
+  // Heartbeat comment frame every 15 s to keep intermediaries from dropping the connection
+  const heartbeat = setInterval(() => {
+    reply.raw.write(':heartbeat\n\n');
+  }, 15_000);
+
+  // Tear down timer and listener when the client disconnects
+  request.raw.on('close', () => {
+    clearInterval(heartbeat);
+    unsubscribe();
+  });
+}
+
+// ═══════════════════════════════════════════
+// Excel export endpoint (new in M2)
+// ═══════════════════════════════════════════
+
+// Builds and returns the task's results as an .xlsx attachment.
+export async function exportTaskResults(
+  request: FastifyRequest<{ Params: { taskId: string } }>,
+  reply: FastifyReply,
+) {
+  const { taskId } = request.params;
+  const buffer = await extractionExcelExporter.exportToExcel(taskId);
+
reply.header('Content-Type', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'); + reply.header('Content-Disposition', `attachment; filename="extraction-${taskId}.xlsx"`); + return reply.send(buffer); +} diff --git a/backend/src/modules/asl/extraction/routes/index.ts b/backend/src/modules/asl/extraction/routes/index.ts new file mode 100644 index 00000000..774c4c43 --- /dev/null +++ b/backend/src/modules/asl/extraction/routes/index.ts @@ -0,0 +1,40 @@ +import { FastifyInstance } from 'fastify'; +import { authenticate, requireModule } from '../../../../common/auth/auth.middleware.js'; +import * as ctrl from '../controllers/ExtractionController.js'; + +/** + * 工具 3 全文提取路由 + * 前缀:/api/v1/asl/extraction(在主路由中注册) + */ +export async function extractionRoutes(fastify: FastifyInstance) { + // SSE 端点独立封装:EventSource 无法设置 Authorization 头, + // 必须在 register() 中单独注册才能跳过 plugin 级 addHook + fastify.register(async function sseRoutes(sub) { + sub.get('/tasks/:taskId/stream', ctrl.streamTaskLogs); + }); + + // 认证保护的路由在独立封装中 + fastify.register(async function authedRoutes(sub) { + sub.addHook('onRequest', authenticate); + sub.addHook('onRequest', requireModule('ASL')); + + // ── 模板 API ────────────────────────────── + sub.get('/templates', ctrl.listTemplates); + sub.get('/templates/:templateId', ctrl.getTemplate); + sub.post('/templates/clone', ctrl.cloneTemplate); + + // ── 提取任务 API ────────────────────────── + sub.post('/tasks', ctrl.createTask); + sub.get('/tasks/:taskId', ctrl.getTaskStatus); + sub.get('/tasks/:taskId/results', ctrl.getTaskResults); + sub.get('/tasks/:taskId/export', ctrl.exportTaskResults); + + // ── 单条提取结果 API(M2 新增)──────────── + sub.get('/results/:resultId', ctrl.getResultDetail); + sub.put('/results/:resultId/review', ctrl.reviewResult); + + // ── PKB 数据代理 API ────────────────────── + sub.get('/knowledge-bases', ctrl.listKnowledgeBases); + sub.get('/knowledge-bases/:kbId/documents', ctrl.listDocuments); + }); +} diff --git 
a/backend/src/modules/asl/extraction/services/DynamicPromptBuilder.ts b/backend/src/modules/asl/extraction/services/DynamicPromptBuilder.ts new file mode 100644 index 00000000..f8e04e98 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/DynamicPromptBuilder.ts @@ -0,0 +1,130 @@ +/** + * XML 隔离 Prompt 构建器 + * + * 将 MinerU 高保真表格 HTML 和 pymupdf4llm Markdown 全文 + * 分别包裹在 XML 标签中,避免大模型混淆两种格式。 + * + * 关键设计: + * 1. 将 schema 数组 [{key,type,label,description},...] 转换为 + * LLM 可直接理解的扁平 JSON 模板 + 具体输出示例 + * 2. System Prompt 声明表格优先级规则 + * 3. 明确 study_id 为"第一作者 年份"格式 + */ + +interface SchemaInput { + baseTemplateCode: string; + outcomeType: string; + schema: Record; +} + +interface PromptOutput { + systemPrompt: string; + userPrompt: string; +} + +/** + * 将 schema 字段数组转换为扁平 JSON 模板(含 _quote 字段) + * 输入: { metadata: [{key:"study_id", type:"string"}, ...], baseline: [...] } + * 输出: { metadata: { study_id: "", study_id_quote: "", ... }, baseline: { ... } } + */ +function buildOutputTemplate(schema: Record): Record> { + const template: Record> = {}; + for (const [module, fields] of Object.entries(schema)) { + if (!Array.isArray(fields)) continue; + const mod: Record = {}; + for (const f of fields) { + if (!f.key) continue; + const typePlaceholder = f.type === 'number' ? '' : f.type === 'integer' ? '' : ''; + mod[f.key] = typePlaceholder; + mod[`${f.key}_quote`] = ''; + } + template[module] = mod; + } + return template; +} + +function buildFieldDescriptions(schema: Record): string { + const lines: string[] = []; + for (const [module, fields] of Object.entries(schema)) { + if (!Array.isArray(fields)) continue; + lines.push(`\n### ${module}`); + for (const f of fields) { + if (!f.key) continue; + const desc = f.description ? 
` — ${f.description}` : ''; + lines.push(`- **${f.key}** (${f.type || 'string'}): ${f.label || f.key}${desc}`); + } + } + return lines.join('\n'); +} + +export function buildExtractionPrompt( + fullMarkdown: string, + tableHtmls: string[], + schema: SchemaInput, +): PromptOutput { + const hasHighFidelityTables = tableHtmls.length > 0; + const outputTemplate = buildOutputTemplate(schema.schema); + const fieldDescriptions = buildFieldDescriptions(schema.schema); + + const systemPrompt = `You are a clinical research data extraction expert with specialized training in evidence-based medicine and systematic review methodology. + +## Your Task +Extract structured data from a medical research paper and return a FLAT JSON object. + +## Data Source Priority Rules +${hasHighFidelityTables ? `CRITICAL: The input contains TWO data representations: +1. — VLM-extracted tables from the PDF. These are HIGH-PRECISION and should be treated as the AUTHORITATIVE source for all tabular data (baseline characteristics, outcomes, subgroup analyses). +2. — OCR-extracted full text in Markdown format. Use this for narrative data (study design, methodology, follow-up duration) and as context for the tables. + +When the same data appears in BOTH sources, ALWAYS prefer .` : `The input contains — OCR-extracted full text in Markdown format. Extract all fields from this text.`} + +## CRITICAL Output Format Rules +1. Return ONLY a valid JSON object. NO markdown fences. NO explanation. NO preamble. +2. The JSON has top-level keys for each module (e.g., "metadata", "baseline", "rob", "outcomes_${schema.outcomeType}"). +3. Each module is a FLAT OBJECT with key-value pairs — NOT an array. +4. For EVERY field, provide both the value and a corresponding "_quote" field: + - "field_key": extracted_value + - "field_key_quote": "exact 15-50 word quote from the source text supporting this value" +5. Use null for fields that cannot be found in the text. Set the _quote to null as well. +6. 
For numeric fields (type: number/integer), return the raw number without units. Include units in the quote. + +## Field-Specific Rules +- **study_id**: MUST be formatted as "FirstAuthor Year" (e.g., "Gandhi 2018", "McCarney 2008"). Extract the first/lead author's surname and publication year. +- **study_design**: Use standard terminology (e.g., "Randomised controlled trial", "Prospective cohort study"). +- **age_***: Report as written in the paper (e.g., "79.3 ± 7.0 years" or "Median 65, IQR 58-72"). +- **male_percent**: Extract overall or per-group male percentage as a number. +- **rob_* (Risk of Bias)**: Assess based on the paper's methodology. For RCT use RoB 2.0 domains, for Cohort use NOS criteria. If the paper does not explicitly state bias assessment, evaluate based on reported methods and state your assessment with justification in the quote.`; + + let userPrompt = `## Field Definitions +${fieldDescriptions} + +## Expected Output Structure +Return a JSON object EXACTLY matching this structure (flat key-value per module): +\`\`\` +${JSON.stringify(outputTemplate, null, 2)} +\`\`\` + +## Study Type: ${schema.baseTemplateCode} +## Outcome Type: ${schema.outcomeType} + +`; + + if (hasHighFidelityTables) { + const tablesContent = tableHtmls + .map((html, i) => `\n${html}`) + .join('\n\n'); + userPrompt += ` +${tablesContent} + + +`; + } + + userPrompt += ` +${fullMarkdown.slice(0, 55000)} + + +Return the JSON object now. Remember: FLAT key-value format per module, NOT arrays. 
No markdown fences.`; + + return { systemPrompt, userPrompt }; +} diff --git a/backend/src/modules/asl/extraction/services/ExtractionEventBus.ts b/backend/src/modules/asl/extraction/services/ExtractionEventBus.ts new file mode 100644 index 00000000..bdf6f1fa --- /dev/null +++ b/backend/src/modules/asl/extraction/services/ExtractionEventBus.ts @@ -0,0 +1,72 @@ +/** + * 提取日志事件总线 — 本 Pod 内存级 EventEmitter + * + * Worker 通过 emit() 发送日志事件, + * SSE 端点通过 subscribe() 监听并推送给前端。 + * + * 不跨 Pod,不持久化。如果 SSE 断开,前端仍靠 React Query 轮询驱动进度。 + */ + +import { EventEmitter } from 'events'; + +export interface ExtractionLogEntry { + source: string; // 'MinerU' | 'DeepSeek' | 'System' | 'Aggregator' | 'Worker' + message: string; + level: 'info' | 'warn' | 'error'; + timestamp?: string; +} + +class ExtractionEventBusImpl { + private emitter = new EventEmitter(); + private recentLogs = new Map(); + private readonly MAX_RECENT = 200; + + constructor() { + this.emitter.setMaxListeners(100); + } + + emit(taskId: string, entry: Omit) { + const logEntry: ExtractionLogEntry = { + ...entry, + timestamp: new Date().toISOString().slice(11, 19), + }; + + let logs = this.recentLogs.get(taskId); + if (!logs) { + logs = []; + this.recentLogs.set(taskId, logs); + } + logs.push(logEntry); + if (logs.length > this.MAX_RECENT) { + logs.splice(0, logs.length - this.MAX_RECENT); + } + + this.emitter.emit(`task:${taskId}`, logEntry); + } + + /** + * 订阅某个任务的日志事件 + * @returns unsubscribe 函数 + */ + subscribe( + taskId: string, + listener: (entry: ExtractionLogEntry) => void, + ): () => void { + const eventName = `task:${taskId}`; + this.emitter.on(eventName, listener); + return () => { + this.emitter.off(eventName, listener); + }; + } + + getRecentLogs(taskId: string): ExtractionLogEntry[] { + return this.recentLogs.get(taskId) || []; + } + + cleanup(taskId: string) { + this.recentLogs.delete(taskId); + this.emitter.removeAllListeners(`task:${taskId}`); + } +} + +export const extractionEventBus = new 
ExtractionEventBusImpl(); diff --git a/backend/src/modules/asl/extraction/services/ExtractionExcelExporter.ts b/backend/src/modules/asl/extraction/services/ExtractionExcelExporter.ts new file mode 100644 index 00000000..7a2a8323 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/ExtractionExcelExporter.ts @@ -0,0 +1,207 @@ +/** + * 提取结果 Excel 宽表导出 + * + * 变量列 + Quote 列交替,双行表头,仅导出 Approved 结果 + * 使用 exceljs 生成 .xlsx + * + * 关键:LLM 返回的 extractedData 是数组格式: + * { metadata: [{key, value, quote}, ...], baseline: [{key, value, quote}, ...] } + * 导出前需要归一化为扁平 key-value 格式 + */ + +import ExcelJS from 'exceljs'; +import { prisma } from '../../../../config/database.js'; +import { logger } from '../../../../common/logging/index.js'; + +const moduleOrder = ['metadata', 'baseline', 'rob', 'outcomes']; +const moduleLabels: Record = { + metadata: 'Metadata', + baseline: 'Baseline', + rob: 'Risk of Bias', + outcomes: 'Outcomes', +}; + +/** + * 将 LLM 返回的模块数据归一化为扁平 key-value 映射 + * + * 输入可能是: + * 数组: [{key: "study_id", value: "xxx", quote: "..."}, ...] + * 对象: {study_id: "xxx", study_id_quote: "..."} + * 对象: {study_id: {value: "xxx", quote: "..."}} + */ +function flattenModuleData(moduleData: any): Record { + if (!moduleData) return {}; + + if (Array.isArray(moduleData)) { + const flat: Record = {}; + for (const item of moduleData) { + if (typeof item === 'object' && item !== null && 'key' in item) { + flat[item.key] = item.value ?? null; + if (item.quote) flat[`${item.key}_quote`] = item.quote; + } + } + return flat; + } + + if (typeof moduleData === 'object' && moduleData !== null) { + const flat: Record = {}; + for (const [k, v] of Object.entries(moduleData)) { + if (typeof v === 'object' && v !== null && !Array.isArray(v) && 'value' in v) { + flat[k] = (v as any).value ?? 
null; + if ((v as any).quote) flat[`${k}_quote`] = (v as any).quote; + } else { + flat[k] = v; + } + } + return flat; + } + + return {}; +} + +class ExtractionExcelExporterImpl { + async exportToExcel(taskId: string): Promise { + const results = await prisma.aslExtractionResult.findMany({ + where: { taskId, reviewStatus: 'approved', status: 'completed' }, + orderBy: { createdAt: 'asc' }, + }); + + if (results.length === 0) { + throw Object.assign(new Error('No approved results to export'), { statusCode: 400 }); + } + + // Flatten all results and collect field keys per module + const flattenedResults: Array<{ filename: string; modules: Record> }> = []; + const fieldKeysByModule = new Map(); + + for (const r of results) { + const data = r.extractedData as Record | null; + if (!data) { + flattenedResults.push({ filename: r.snapshotFilename, modules: {} }); + continue; + } + + const modules: Record> = {}; + for (const [mod, fields] of Object.entries(data)) { + const moduleName = mod.startsWith('outcomes_') ? 
'outcomes' : mod; + const flat = flattenModuleData(fields); + modules[moduleName] = flat; + + if (!fieldKeysByModule.has(moduleName)) { + fieldKeysByModule.set(moduleName, []); + } + const existing = fieldKeysByModule.get(moduleName)!; + for (const key of Object.keys(flat)) { + if (!key.endsWith('_quote') && !existing.includes(key)) { + existing.push(key); + } + } + } + flattenedResults.push({ filename: r.snapshotFilename, modules }); + } + + // Build ordered column list + const columns: Array<{ module: string; key: string; label: string }> = []; + for (const mod of moduleOrder) { + const keys = fieldKeysByModule.get(mod); + if (!keys) continue; + for (const key of keys) { + columns.push({ module: mod, key, label: this.humanLabel(key) }); + } + } + + const workbook = new ExcelJS.Workbook(); + const sheet = workbook.addWorksheet('Extraction Results'); + + // Row 1: Module group headers (merged) + const headerRow1: string[] = ['#', '']; + const headerRow2: string[] = ['#', 'Study']; + + for (const col of columns) { + headerRow1.push(moduleLabels[col.module] || col.module, ''); + headerRow2.push(col.label, `${col.label} (Quote)`); + } + + sheet.addRow(headerRow1); + sheet.addRow(headerRow2); + + // Merge module header cells + let colIdx = 3; + for (const _col of columns) { + sheet.mergeCells(1, colIdx, 1, colIdx + 1); + colIdx += 2; + } + + // Style headers + const headerFill: ExcelJS.Fill = { + type: 'pattern', + pattern: 'solid', + fgColor: { argb: '4472C4' }, + }; + const headerFont: Partial = { color: { argb: 'FFFFFF' }, bold: true, size: 10 }; + + [1, 2].forEach((rowNum) => { + const row = sheet.getRow(rowNum); + row.eachCell((cell) => { + cell.fill = headerFill; + cell.font = headerFont; + cell.alignment = { horizontal: 'center', vertical: 'middle', wrapText: true }; + }); + }); + + // Data rows + for (let i = 0; i < flattenedResults.length; i++) { + const fr = flattenedResults[i]; + const rowValues: any[] = [i + 1, fr.filename]; + + for (const col of columns) { 
+ const modData = fr.modules[col.module] || {}; + const value = modData[col.key] ?? ''; + const quote = modData[`${col.key}_quote`] ?? ''; + rowValues.push(this.formatCellValue(value), this.formatCellValue(quote)); + } + + const row = sheet.addRow(rowValues); + if (i % 2 === 1) { + row.eachCell((cell) => { + cell.fill = { + type: 'pattern', + pattern: 'solid', + fgColor: { argb: 'F2F7FC' }, + }; + }); + } + } + + // Column widths + sheet.getColumn(1).width = 5; + sheet.getColumn(2).width = 35; + for (let i = 3; i <= 2 + columns.length * 2; i++) { + sheet.getColumn(i).width = i % 2 === 1 ? 18 : 35; + } + + const buf = await workbook.xlsx.writeBuffer(); + logger.info('[ExcelExporter] Export completed', { + taskId, + rows: flattenedResults.length, + columns: columns.length, + }); + return Buffer.from(buf); + } + + private humanLabel(key: string): string { + return key + .replace(/_/g, ' ') + .replace(/\b\w/g, (c) => c.toUpperCase()); + } + + private formatCellValue(val: any): string { + if (val === null || val === undefined) return ''; + if (typeof val === 'object' && val !== null && 'value' in val) return this.formatCellValue(val.value); + if (Array.isArray(val)) return val.map((v) => this.formatCellValue(v)).join(', '); + if (typeof val === 'object') return JSON.stringify(val); + return String(val); + } +} + +export const extractionExcelExporter = new ExtractionExcelExporterImpl(); diff --git a/backend/src/modules/asl/extraction/services/ExtractionService.ts b/backend/src/modules/asl/extraction/services/ExtractionService.ts new file mode 100644 index 00000000..f0aef806 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/ExtractionService.ts @@ -0,0 +1,176 @@ +import { prisma } from '../../../../config/database.js'; +import { logger } from '../../../../common/logging/index.js'; +import { jobQueue } from '../../../../common/jobs/index.js'; +import { PgBossQueue } from '../../../../common/jobs/PgBossQueue.js'; +import { templateService } from 
'./TemplateService.js'; + +/** + * 提取任务服务 + * 核心:API 层散装派发(无 Manager),Worker 只写自己的 Result + */ +export class ExtractionService { + /** + * 创建提取任务 + 散装派发 N 个独立 Job + * + * 三步走: + * 1. DB 级幂等创建 Task + * 2. PKB 快照冻结 → createMany Result + * 3. 散装派发 N 个 pg-boss Job + */ + async createTask(params: { + projectId: string; + userId: string; + projectTemplateId: string; + pkbKnowledgeBaseId: string; + documentIds: string[]; + idempotencyKey?: string; + pkbBridge: { + getDocumentDetail: (docId: string) => Promise<{ + documentId: string; + storageKey: string; + filename: string; + }>; + }; + }) { + const { + projectId, userId, projectTemplateId, + pkbKnowledgeBaseId, documentIds, idempotencyKey, pkbBridge, + } = params; + + if (documentIds.length === 0) { + throw Object.assign(new Error('No documents selected'), { statusCode: 400 }); + } + + // 锁定模板 + await templateService.lockTemplate(projectTemplateId); + + // DB 级幂等:@unique 索引 + P2002 冲突捕获 + let task; + try { + task = await prisma.aslExtractionTask.create({ + data: { + projectId, + userId, + projectTemplateId, + pkbKnowledgeBaseId, + totalCount: documentIds.length, + status: 'processing', + idempotencyKey: idempotencyKey || undefined, + }, + }); + } catch (error: any) { + if (error.code === 'P2002' && idempotencyKey) { + const existing = await prisma.aslExtractionTask.findFirst({ + where: { idempotencyKey }, + }); + if (existing) { + return { taskId: existing.id, note: 'Idempotent return' }; + } + } + throw error; + } + + // PKB 快照冻结:提取可能持续 50 分钟,期间用户可能在 PKB 删除/修改文档 + const pkbDocs = await Promise.all( + documentIds.map(id => pkbBridge.getDocumentDetail(id)) + ); + + const resultsData = pkbDocs.map(doc => ({ + taskId: task.id, + projectId, + pkbDocumentId: doc.documentId, + snapshotStorageKey: doc.storageKey, + snapshotFilename: doc.filename, + status: 'pending', + })); + await prisma.aslExtractionResult.createMany({ data: resultsData }); + + const createdResults = await prisma.aslExtractionResult.findMany({ + where: { 
taskId: task.id }, + }); + + // 散装派发:N 个独立 Job 一次入队(pg-boss 原生 API) + const boss = (jobQueue as PgBossQueue).getNativeBoss(); + const jobs = createdResults.map(result => ({ + data: { + resultId: result.id, + taskId: task.id, + pkbDocumentId: result.pkbDocumentId, + }, + retryLimit: 3, + retryBackoff: true, + expireInSeconds: 30 * 60, + singletonKey: `extract-${result.id}`, + })); + await boss.insert('asl_extract_single', jobs); + + logger.info('[ExtractionService] Task created with scatter dispatch', { + taskId: task.id, + documentCount: documentIds.length, + jobsDispatched: jobs.length, + }); + + return { taskId: task.id }; + } + + /** + * 获取任务状态(groupBy 聚合 Result 状态,无冗余计数字段) + */ + async getTaskStatus(taskId: string) { + const task = await prisma.aslExtractionTask.findUnique({ + where: { id: taskId }, + }); + if (!task) throw Object.assign(new Error('Task not found'), { statusCode: 404 }); + + const stats = await prisma.aslExtractionResult.groupBy({ + by: ['status'], + where: { taskId }, + _count: true, + }); + + const completedCount = stats.find(s => s.status === 'completed')?._count ?? 0; + const errorCount = stats.find(s => s.status === 'error')?._count ?? 0; + const extractingCount = stats.find(s => s.status === 'extracting')?._count ?? 0; + const pendingCount = stats.find(s => s.status === 'pending')?._count ?? 0; + const processed = completedCount + errorCount; + + return { + taskId: task.id, + status: task.status, + totalCount: task.totalCount, + completedCount, + errorCount, + extractingCount, + pendingCount, + percent: task.totalCount > 0 + ? 
Math.round((processed / task.totalCount) * 100) + : 0, + createdAt: task.createdAt, + completedAt: task.completedAt, + }; + } + + /** + * 获取提取结果列表 + */ + async getResults(taskId: string) { + const results = await prisma.aslExtractionResult.findMany({ + where: { taskId }, + orderBy: { createdAt: 'asc' }, + select: { + id: true, + pkbDocumentId: true, + snapshotFilename: true, + status: true, + reviewStatus: true, + extractedData: true, + errorMessage: true, + processedAt: true, + createdAt: true, + }, + }); + return results; + } +} + +export const extractionService = new ExtractionService(); diff --git a/backend/src/modules/asl/extraction/services/ExtractionValidator.ts b/backend/src/modules/asl/extraction/services/ExtractionValidator.ts new file mode 100644 index 00000000..fd6b6e31 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/ExtractionValidator.ts @@ -0,0 +1,166 @@ +/** + * 提取结果验证器 — fuzzyQuoteMatch 三级置信度 + * + * 对 LLM 返回的每个字段,检查其附带的 quote 是否能在原文中找到匹配。 + * 返回三级置信度:high / medium / low + * + * 搜索范围 = MinerU HTML (html-to-text 剥离标签) + 全文 Markdown 拼接 + */ + +import { logger } from '../../../../common/logging/index.js'; + +interface QuoteVerificationEntry { + confidence: 'high' | 'medium' | 'low'; + quote: string; + matchScore: number; +} + +type QuoteVerificationResult = Record>; + +class ExtractionValidatorImpl { + /** + * 构建搜索范围文本:MinerU HTML 纯文本 + 全文 Markdown + */ + buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string { + const parts: string[] = []; + + for (const html of tableHtmls) { + parts.push(this.htmlToPlainText(html)); + } + + parts.push(fullMarkdown); + + return parts.join('\n'); + } + + /** + * 验证 extractedData 中所有字段的 quote 置信度 + * 兼容数组格式 [{key, value, quote}] 和扁平格式 {field: value, field_quote: "..."} + */ + verifyAllQuotes( + extractedData: Record, + searchScope: string, + ): QuoteVerificationResult { + const result: QuoteVerificationResult = {}; + const normalizedScope = this.normalize(searchScope); + + for (const 
[module, fields] of Object.entries(extractedData)) { + if (typeof fields !== 'object' || fields === null) continue; + result[module] = {}; + + if (Array.isArray(fields)) { + for (const item of fields) { + if (typeof item !== 'object' || !item || !item.key) continue; + const quote = item.quote; + if (!quote || typeof quote !== 'string') continue; + const entry = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote); + result[module][item.key] = entry; + } + } else { + for (const [key, value] of Object.entries(fields)) { + if (key.endsWith('_quote')) continue; + + // Check for nested {value, quote} object + if (typeof value === 'object' && value !== null && 'quote' in value) { + const quote = (value as any).quote; + if (quote && typeof quote === 'string') { + const entry = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote); + result[module][key] = entry; + } + continue; + } + + const quoteKey = `${key}_quote`; + const quote = fields[quoteKey]; + if (!quote || typeof quote !== 'string') continue; + const entry = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote); + result[module][key] = entry; + } + } + } + + return result; + } + + /** + * 核心算法:fuzzyQuoteMatch + * + * 1. 精确子串匹配 → high (score = 1.0) + * 2. 忽略空白/标点后子串匹配 → high (score = 0.95) + * 3. 关键词覆盖率 ≥ 80% → medium + * 4. 关键词覆盖率 ≥ 50% → medium (lower score) + * 5. 
覆盖率 < 50% → low + */ + fuzzyQuoteMatch( + rawScope: string, + normalizedScope: string, + llmQuote: string, + ): QuoteVerificationEntry { + const trimmedQuote = llmQuote.trim(); + if (trimmedQuote.length < 3) { + return { confidence: 'low', quote: trimmedQuote, matchScore: 0 }; + } + + // Exact substring match + if (rawScope.includes(trimmedQuote)) { + return { confidence: 'high', quote: trimmedQuote, matchScore: 1.0 }; + } + + // Normalized substring match (collapse whitespace, remove punctuation) + const normalizedQuote = this.normalize(trimmedQuote); + if (normalizedScope.includes(normalizedQuote)) { + return { confidence: 'high', quote: trimmedQuote, matchScore: 0.95 }; + } + + // Keyword overlap + const quoteTokens = this.tokenize(trimmedQuote); + if (quoteTokens.length === 0) { + return { confidence: 'low', quote: trimmedQuote, matchScore: 0 }; + } + + const matchedTokens = quoteTokens.filter((t) => normalizedScope.includes(t)); + const coverage = matchedTokens.length / quoteTokens.length; + + if (coverage >= 0.8) { + return { confidence: 'medium', quote: trimmedQuote, matchScore: coverage }; + } + if (coverage >= 0.5) { + return { confidence: 'medium', quote: trimmedQuote, matchScore: coverage }; + } + + return { confidence: 'low', quote: trimmedQuote, matchScore: coverage }; + } + + private normalize(text: string): string { + return text + .toLowerCase() + .replace(/[\s\u00A0]+/g, ' ') + .replace(/[^\w\s\u4e00-\u9fff]/g, '') + .trim(); + } + + private tokenize(text: string): string[] { + return this.normalize(text) + .split(/\s+/) + .filter((t) => t.length >= 2); + } + + /** + * 简易 HTML → 纯文本(不引入 html-to-text 依赖) + */ + private htmlToPlainText(html: string): string { + return html + .replace(/<br\s*\/?>/gi, '\n') + .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n') + .replace(/<[^>]+>/g, '') + .replace(/&nbsp;/g, ' ') + .replace(/&amp;/g, '&') + .replace(/&lt;/g, '<') + .replace(/&gt;/g, '>') + .replace(/&quot;/g, '"') + .replace(/\n{3,}/g, '\n\n') + 
.trim(); + } +} + +export const extractionValidator = new ExtractionValidatorImpl(); diff --git a/backend/src/modules/asl/extraction/services/PdfProcessingPipeline.ts b/backend/src/modules/asl/extraction/services/PdfProcessingPipeline.ts new file mode 100644 index 00000000..08195d89 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/PdfProcessingPipeline.ts @@ -0,0 +1,119 @@ +/** + * PDF 预处理管线 — MinerU VLM 表格提取 + OSS Clean Data 缓存 + * + * 串行流程: + * 1. 检查 OSS 缓存 `{storageKey}_mineru_clean.json` + * 2. 命中 → 返回缓存 + * 3. 未命中 → 调用 MinerU Cloud API → 存 OSS + * 4. MinerU 超时(>3min) → 自动降级到纯文本 + */ + +import { storage } from '../../../../common/storage/index.js'; +import { getTableExtractionManager } from '../../../../common/document/tableExtraction/index.js'; +import type { ExtractionResult } from '../../../../common/document/tableExtraction/types.js'; +import { logger } from '../../../../common/logging/index.js'; + +const MINERU_TIMEOUT_MS = 3 * 60 * 1000; + +export interface PdfCleanData { + fullMarkdown: string; + tableHtmls: string[]; + engine: string; + cached: boolean; + duration?: number; +} + +class PdfProcessingPipelineImpl { + /** + * 获取 PDF 的 Clean Data(MinerU 结构化结果 + 全文 Markdown) + * + * @param storageKey OSS 上的原始 PDF key + * @param plainText PKB 已提取的纯文本(降级回退用) + * @returns 结构化 clean data + */ + async process(storageKey: string, plainText: string | null): Promise { + const cacheKey = `${storageKey}_mineru_clean.json`; + + // ── 1. 检查 OSS 缓存 ────────────────────── + try { + const exists = await storage.exists(cacheKey); + if (exists) { + const buf = await storage.download(cacheKey); + const data = JSON.parse(buf.toString('utf-8')) as PdfCleanData; + logger.info('[PdfPipeline] Cache HIT', { storageKey }); + return { ...data, cached: true }; + } + } catch { + // 缓存读取失败,继续走 MinerU + } + + // ── 2. 
MinerU 表格提取(带超时降级)──────── + const manager = getTableExtractionManager(); + const hasMinerU = !!process.env.MINERU_API_TOKEN; + + if (hasMinerU) { + try { + const pdfBuffer = await storage.download(storageKey); + const filename = storageKey.split('/').pop() || 'paper.pdf'; + + const result = await Promise.race([ + manager.extractTables(pdfBuffer, filename, { keepRaw: true }), + new Promise<'timeout'>((resolve) => + setTimeout(() => resolve('timeout'), MINERU_TIMEOUT_MS) + ), + ]); + + if (result === 'timeout') { + logger.warn('[PdfPipeline] MinerU timeout, fallback to plain text', { storageKey }); + return this.buildFallback(plainText); + } + + const tableHtmls = result.tables + .map((t) => t.rawHtml) + .filter((h): h is string => !!h); + + const cleanData: PdfCleanData = { + fullMarkdown: result.fullMarkdown || '', + tableHtmls, + engine: result.engine, + cached: false, + duration: result.duration, + }; + + // ── 3. 写入 OSS 缓存 ──────────────────── + try { + const buf = Buffer.from(JSON.stringify(cleanData), 'utf-8'); + await storage.upload(cacheKey, buf); + logger.info('[PdfPipeline] Cache STORED', { + storageKey, + size: `${(buf.length / 1024).toFixed(1)} KB`, + }); + } catch (e: any) { + logger.warn('[PdfPipeline] Failed to cache clean data', { error: e.message }); + } + + return cleanData; + } catch (e: any) { + logger.warn('[PdfPipeline] MinerU extraction failed, fallback', { + storageKey, + error: e.message, + }); + return this.buildFallback(plainText); + } + } + + // MinerU 未配置 → 直接降级 + return this.buildFallback(plainText); + } + + private buildFallback(plainText: string | null): PdfCleanData { + return { + fullMarkdown: plainText || '', + tableHtmls: [], + engine: 'plaintext-fallback', + cached: false, + }; + } +} + +export const pdfProcessingPipeline = new PdfProcessingPipelineImpl(); diff --git a/backend/src/modules/asl/extraction/services/PkbBridgeService.ts b/backend/src/modules/asl/extraction/services/PkbBridgeService.ts new file mode 100644 index 
00000000..0adf9967 --- /dev/null +++ b/backend/src/modules/asl/extraction/services/PkbBridgeService.ts @@ -0,0 +1,65 @@ +import { pkbExportService, type PkbKnowledgeBaseExportDTO, type PkbDocumentExportDTO } from '../../../pkb/services/PkbExportService.js'; +import { logger } from '../../../../common/logging/index.js'; + +/** + * PKB 防腐层桥接服务(ASL 侧) + * + * ASL 绝不直接 import PKB 内部类型或查 pkb_schema。 + * 所有 PKB 数据访问通过此桥接 → pkbExportService(DTO)完成。 + */ + +export interface PkbDocumentDTO { + documentId: string; + storageKey: string; + filename: string; + extractedText: string | null; + fileSizeBytes: number; +} + +export interface PkbKnowledgeBaseDTO { + id: string; + name: string; + fileCount: number; +} + +class PkbBridgeServiceImpl { + /** + * 获取用户的知识库列表 + */ + async listKnowledgeBases(userId: string): Promise { + return pkbExportService.listKnowledgeBases(userId); + } + + /** + * 获取知识库内的 PDF 文档列表 + */ + async listPdfDocuments(kbId: string): Promise { + return pkbExportService.listPdfDocuments(kbId); + } + + /** + * 获取单篇文档详情(用于 API 层快照冻结) + */ + async getDocumentDetail(documentId: string): Promise<{ + documentId: string; + storageKey: string; + filename: string; + }> { + const doc = await pkbExportService.getDocumentForExtraction(documentId); + return { + documentId: doc.documentId, + storageKey: doc.storageKey, + filename: doc.filename, + }; + } + + /** + * 获取文档全文(Worker 使用,读取 extractedText) + */ + async getDocumentFullText(documentId: string): Promise { + const doc = await pkbExportService.getDocumentForExtraction(documentId); + return doc.extractedText; + } +} + +export const pkbBridgeService = new PkbBridgeServiceImpl(); diff --git a/backend/src/modules/asl/extraction/services/TemplateService.ts b/backend/src/modules/asl/extraction/services/TemplateService.ts new file mode 100644 index 00000000..d16d1b9b --- /dev/null +++ b/backend/src/modules/asl/extraction/services/TemplateService.ts @@ -0,0 +1,116 @@ +import { prisma } from '../../../../config/database.js'; 
+import { logger } from '../../../../common/logging/index.js'; + +/** + * 模板引擎服务(M1:基座模板只读;M3:自定义字段 CRUD) + */ +export class TemplateService { + /** + * 获取所有系统内置模板列表 + */ + async listSystemTemplates() { + const templates = await prisma.aslExtractionTemplate.findMany({ + where: { isSystem: true }, + orderBy: { code: 'asc' }, + }); + return templates; + } + + /** + * 获取单个系统模板详情 + */ + async getSystemTemplate(templateId: string) { + const template = await prisma.aslExtractionTemplate.findUnique({ + where: { id: templateId }, + }); + if (!template) throw new Error('Template not found'); + return template; + } + + /** + * 克隆系统模板为项目模板 + * 幂等:同 projectId + baseTemplateId 只允许一条 + */ + async cloneToProject(projectId: string, baseTemplateId: string, userId: string) { + const baseTemplate = await prisma.aslExtractionTemplate.findUnique({ + where: { id: baseTemplateId }, + }); + if (!baseTemplate) throw new Error('Base template not found'); + + const projectTemplate = await prisma.aslProjectTemplate.upsert({ + where: { + projectId_baseTemplateId: { + projectId, + baseTemplateId, + }, + }, + update: {}, + create: { + projectId, + userId, + baseTemplateId, + outcomeType: 'survival', + customFields: [], + isLocked: false, + }, + }); + + logger.info('[TemplateService] Cloned template to project', { + projectId, + baseTemplateCode: baseTemplate.code, + projectTemplateId: projectTemplate.id, + }); + + return projectTemplate; + } + + /** + * 获取项目模板(含基座模板信息) + */ + async getProjectTemplate(projectTemplateId: string) { + const pt = await prisma.aslProjectTemplate.findUnique({ + where: { id: projectTemplateId }, + include: { baseTemplate: true }, + }); + if (!pt) throw new Error('Project template not found'); + return pt; + } + + /** + * 锁定项目模板(提取启动后不可修改) + */ + async lockTemplate(projectTemplateId: string) { + await prisma.aslProjectTemplate.update({ + where: { id: projectTemplateId }, + data: { isLocked: true }, + }); + } + + /** + * M1:组装完整 Schema(基座字段 + _quote 对应字段) + * M3 升级:加入 
customFields + */ + async assembleFullSchema(projectTemplateId: string) { + const pt = await this.getProjectTemplate(projectTemplateId); + const baseFields = pt.baseTemplate.baseFields as Record; + const outcomeType = pt.outcomeType; + + const schema: Record = {}; + + for (const [module, fields] of Object.entries(baseFields)) { + if (module.startsWith('outcomes_') && !module.endsWith(outcomeType) && module !== 'outcomes_' + outcomeType) { + continue; + } + schema[module] = fields; + } + + return { + projectTemplateId: pt.id, + baseTemplateCode: pt.baseTemplate.code, + outcomeType, + schema, + }; + } +} + +export const templateService = new TemplateService(); diff --git a/backend/src/modules/asl/extraction/workers/ExtractionAggregator.ts b/backend/src/modules/asl/extraction/workers/ExtractionAggregator.ts new file mode 100644 index 00000000..f9f5025a --- /dev/null +++ b/backend/src/modules/asl/extraction/workers/ExtractionAggregator.ts @@ -0,0 +1,71 @@ +import { prisma } from '../../../../config/database.js'; +import { logger } from '../../../../common/logging/index.js'; + +const ZOMBIE_TIMEOUT_MS = 30 * 60 * 1000; + +/** + * Aggregator — pg-boss schedule 每 2 分钟执行 + * + * 一人兼两职: + * 1. 僵尸清理:extracting > 30min → error + * 2. 
收口判定:pending === 0 && extracting === 0 → Task completed + */ +export async function aggregatorHandler() { + const tasks = await prisma.aslExtractionTask.findMany({ + where: { status: 'processing' }, + }); + + if (tasks.length === 0) return; + + logger.info(`[Aggregator] Scanning ${tasks.length} processing task(s)`); + + for (const task of tasks) { + // ═══════════════════════════════════════════ + // 职责 1:僵尸清理 + // ═══════════════════════════════════════════ + const zombieResult = await prisma.aslExtractionResult.updateMany({ + where: { + taskId: task.id, + status: 'extracting', + updatedAt: { lt: new Date(Date.now() - ZOMBIE_TIMEOUT_MS) }, + }, + data: { + status: 'error', + errorMessage: '[Aggregator] Timeout after 30min, likely worker crash.', + }, + }); + if (zombieResult.count > 0) { + logger.warn(`[Aggregator] Cleaned ${zombieResult.count} zombie result(s) for task ${task.id}`); + } + + // ═══════════════════════════════════════════ + // 职责 2:聚合统计 — groupBy 一次查询 + // ═══════════════════════════════════════════ + const stats = await prisma.aslExtractionResult.groupBy({ + by: ['status'], + where: { taskId: task.id }, + _count: true, + }); + const pending = stats.find(s => s.status === 'pending')?._count ?? 0; + const extracting = stats.find(s => s.status === 'extracting')?._count ?? 0; + + // ═══════════════════════════════════════════ + // 职责 3:收口 — pending 和 extracting 都清零即完成 + // ═══════════════════════════════════════════ + if (pending === 0 && extracting === 0) { + const completed = stats.find(s => s.status === 'completed')?._count ?? 0; + const errored = stats.find(s => s.status === 'error')?._count ?? 0; + + const finalStatus = errored > 0 && completed === 0 ? 'failed' : 'completed'; + + await prisma.aslExtractionTask.update({ + where: { id: task.id }, + data: { status: finalStatus, completedAt: new Date() }, + }); + logger.info( + `[Aggregator] Task ${task.id} finished with status: ${finalStatus}. 
` + + `Stats: ${completed} Success, ${errored} Failed, ${task.totalCount} Total.` + ); + } + } +} diff --git a/backend/src/modules/asl/extraction/workers/ExtractionSingleWorker.ts b/backend/src/modules/asl/extraction/workers/ExtractionSingleWorker.ts new file mode 100644 index 00000000..9ecb4355 --- /dev/null +++ b/backend/src/modules/asl/extraction/workers/ExtractionSingleWorker.ts @@ -0,0 +1,199 @@ +import { prisma } from '../../../../config/database.js'; +import { logger } from '../../../../common/logging/index.js'; +import { LLMFactory } from '../../../../common/llm/adapters/LLMFactory.js'; +import { templateService } from '../services/TemplateService.js'; +import { pkbBridgeService } from '../services/PkbBridgeService.js'; +import { pdfProcessingPipeline } from '../services/PdfProcessingPipeline.js'; +import { buildExtractionPrompt } from '../services/DynamicPromptBuilder.js'; +import { extractionValidator } from '../services/ExtractionValidator.js'; +import { extractionEventBus } from '../services/ExtractionEventBus.js'; +import type { Message } from '../../../../common/llm/adapters/types.js'; + +interface ExtractJobData { + resultId: string; + taskId: string; + pkbDocumentId: string; +} + +export class PermanentExtractionError extends Error { + constructor(message: string) { + super(message); + this.name = 'PermanentExtractionError'; + } +} + +/** + * v2.0 散装 Worker — 只管自己的 Result,绝不碰 Task 表 + * + * M1 阶段:纯文本降级提取(不接 MinerU) + * 调用 DeepSeek → JSON 结构化提取 + */ +class ExtractionSingleWorkerImpl { + async handle(job: { data: ExtractJobData }) { + const { resultId, taskId, pkbDocumentId } = job.data; + + // ═══════════════════════════════════════════ + // 幽灵重试守卫:只允许 pending → extracting + // ═══════════════════════════════════════════ + const lock = await prisma.aslExtractionResult.updateMany({ + where: { id: resultId, status: 'pending' }, + data: { status: 'extracting' }, + }); + if (lock.count === 0) { + logger.info('[Worker] Phantom retry skipped', { resultId 
}); + return { success: true, note: 'Phantom retry skipped' }; + } + + try { + // 获取 Result 记录(含 snapshotStorageKey) + const result = await prisma.aslExtractionResult.findUnique({ + where: { id: resultId }, + include: { + task: { + include: { projectTemplate: true }, + }, + }, + }); + if (!result) throw new PermanentExtractionError('Result record not found'); + + // 组装提取 Schema + const fullSchema = await templateService.assembleFullSchema(result.task.projectTemplateId); + + // 从 PKB 读 extractedText(降级回退用) + let plainText: string | null = null; + try { + plainText = await pkbBridgeService.getDocumentFullText(pkbDocumentId); + } catch (e: any) { + if (e.name === 'PkbDocumentNotFoundError') { + throw new PermanentExtractionError(`PKB document not found: ${pkbDocumentId}`); + } + throw e; + } + + // M2: MinerU 管线 — 结构化表格 + 全文 Markdown(自动 OSS 缓存 + 超时降级) + extractionEventBus.emit(taskId, { source: 'System', message: `开始处理: ${result.snapshotFilename}`, level: 'info' }); + + const storageKey = result.snapshotStorageKey; + const cleanData = await pdfProcessingPipeline.process(storageKey, plainText); + extractionEventBus.emit(taskId, { + source: cleanData.engine === 'plaintext-fallback' ? 'System' : 'MinerU', + message: cleanData.cached + ? `缓存命中 (${cleanData.engine})` + : `解析完成 (${cleanData.engine}, ${cleanData.duration ?? 
0}ms)`, + level: 'info', + }); + + if (!cleanData.fullMarkdown && (!plainText || plainText.trim().length === 0)) { + throw new PermanentExtractionError('Document has no extracted text'); + } + + // M2: XML 隔离 Prompt(MinerU 表格优先级) + const { systemPrompt, userPrompt } = buildExtractionPrompt( + cleanData.fullMarkdown || plainText || '', + cleanData.tableHtmls, + fullSchema, + ); + + // 调用 LLM 进行结构化提取 + extractionEventBus.emit(taskId, { source: 'DeepSeek', message: '开始 LLM 结构化提取...', level: 'info' }); + const extractedData = await this.callLLM(systemPrompt, userPrompt); + extractionEventBus.emit(taskId, { source: 'DeepSeek', message: '提取完成', level: 'info' }); + + // M2: fuzzyQuoteMatch 三级置信度 + const searchScope = extractionValidator.buildQuoteSearchScope( + cleanData.fullMarkdown || plainText || '', + cleanData.tableHtmls, + ); + const quoteVerification = extractionValidator.verifyAllQuotes(extractedData, searchScope); + + // 只更新自己的 Result 行,绝不碰 Task 表! + await prisma.aslExtractionResult.update({ + where: { id: resultId }, + data: { + status: 'completed', + extractedData, + quoteVerification: quoteVerification as any, + processedAt: new Date(), + }, + }); + + extractionEventBus.emit(taskId, { source: 'System', message: `✅ ${result.snapshotFilename} 提取完成`, level: 'info' }); + + logger.info('[Worker] Extraction completed', { resultId, taskId }); + return { success: true }; + } catch (error: any) { + if (isPermanentError(error)) { + await prisma.aslExtractionResult.update({ + where: { id: resultId }, + data: { + status: 'error', + errorMessage: error.message?.slice(0, 1000) || 'Unknown permanent error', + }, + }); + logger.warn('[Worker] Permanent error', { resultId, error: error.message }); + return { success: false, note: 'Permanent error' }; + } + + // 临时错误:回退 status → pending,让下次重试能通过幽灵守卫 + await prisma.aslExtractionResult.update({ + where: { id: resultId }, + data: { status: 'pending' }, + }); + logger.warn('[Worker] Transient error, rollback to pending', { 
resultId, error: error.message }); + throw error; + } + } + + /** + * LLM 调用:DynamicPromptBuilder 提供的 system/user prompt → DeepSeek → 解析 JSON + */ + private async callLLM(systemPrompt: string, userPrompt: string) { + const llm = LLMFactory.getAdapter('deepseek-v3'); + + const messages: Message[] = [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt }, + ]; + + const response = await llm.chat(messages, { temperature: 0.1 }); + const content = response.content.trim(); + + const match = content.match(/\{[\s\S]*\}/); + if (!match) { + throw new PermanentExtractionError( + `LLM returned no valid JSON object. Content: ${content.slice(0, 200)}` + ); + } + + // 清洗 JSON 字符串值内的非法控制字符(LLM 常见问题: + // 在双引号字符串内嵌入裸换行/制表符,导致 JSON.parse 崩溃) + const sanitized = match[0].replace( + /("(?:[^"\\]|\\.)*")/g, + (str) => str.replace(/[\x00-\x1f\x7f]/g, (ch) => { + if (ch === '\n') return '\\n'; + if (ch === '\r') return '\\r'; + if (ch === '\t') return '\\t'; + return ''; + }), + ); + + try { + return JSON.parse(sanitized); + } catch (e: any) { + throw new PermanentExtractionError( + `LLM returned unparseable JSON: ${e.message}. 
Snippet: ${sanitized.slice(0, 200)}` + ); + } + } +} + +function isPermanentError(error: any): boolean { + if (error instanceof PermanentExtractionError) return true; + if (error.name === 'PkbDocumentNotFoundError') return true; + if (error.name === 'PdfCorruptedError') return true; + const status = error.status || error.statusCode; + if (status && status >= 400 && status < 500 && status !== 429) return true; + return false; +} + +export const extractionSingleWorker = new ExtractionSingleWorkerImpl(); diff --git a/backend/src/modules/asl/extraction/workers/index.ts b/backend/src/modules/asl/extraction/workers/index.ts new file mode 100644 index 00000000..623aba08 --- /dev/null +++ b/backend/src/modules/asl/extraction/workers/index.ts @@ -0,0 +1,42 @@ +import { jobQueue } from '../../../../common/jobs/index.js'; +import { PgBossQueue } from '../../../../common/jobs/PgBossQueue.js'; +import { extractionSingleWorker } from './ExtractionSingleWorker.js'; +import { aggregatorHandler } from './ExtractionAggregator.js'; +import { logger } from '../../../../common/logging/index.js'; + +/** + * 注册工具 3 全文提取的 Worker + Aggregator + * + * 使用 pg-boss 原生 API(Level 2 散装模式),不经过 PgBossQueue.process() + */ +export async function registerExtractionWorkers() { + const boss = (jobQueue as PgBossQueue).getNativeBoss(); + + // pg-boss v12 要求先 createQueue 再 work/schedule + await boss.createQueue('asl_extract_single').catch(() => {}); + await boss.createQueue('asl_extraction_aggregator').catch(() => {}); + + // ═══════════════════════════════════════════ + // 单一 Worker 队列:batchSize=1 使每个 Job 独立处理 + // ═══════════════════════════════════════════ + await boss.work<{ resultId: string; taskId: string; pkbDocumentId: string }>( + 'asl_extract_single', + { batchSize: 1 }, + async (jobs) => { + const job = jobs[0]; + if (job) { + await extractionSingleWorker.handle(job); + } + }, + ); + logger.info('[Extraction] Registered asl_extract_single worker'); + + // 
═══════════════════════════════════════════ + // Aggregator 定时收口:每 2 分钟扫描 + // ═══════════════════════════════════════════ + await boss.schedule('asl_extraction_aggregator', '*/2 * * * *'); + await boss.work('asl_extraction_aggregator', async () => { + await aggregatorHandler(); + }); + logger.info('[Extraction] Registered asl_extraction_aggregator (schedule: */2 * * * *)'); +} diff --git a/backend/src/modules/asl/routes/index.ts b/backend/src/modules/asl/routes/index.ts index ecc3073a..871320d3 100644 --- a/backend/src/modules/asl/routes/index.ts +++ b/backend/src/modules/asl/routes/index.ts @@ -9,6 +9,7 @@ import * as screeningController from '../controllers/screeningController.js'; import * as fulltextScreeningController from '../fulltext-screening/controllers/FulltextScreeningController.js'; import * as researchController from '../controllers/researchController.js'; import * as deepResearchController from '../controllers/deepResearchController.js'; +import { extractionRoutes } from '../extraction/routes/index.js'; import { authenticate, requireModule } from '../../../common/auth/auth.middleware.js'; export async function aslRoutes(fastify: FastifyInstance) { @@ -107,6 +108,9 @@ export async function aslRoutes(fastify: FastifyInstance) { // V2.0 导出 Word fastify.get('/research/tasks/:taskId/export-word', { preHandler: [authenticate, requireModule('ASL')] }, deepResearchController.exportWord); + + // ==================== 工具 3:全文智能提取路由 ==================== + await fastify.register(extractionRoutes, { prefix: '/extraction' }); } diff --git a/backend/src/modules/pkb/services/PkbExportService.ts b/backend/src/modules/pkb/services/PkbExportService.ts new file mode 100644 index 00000000..9cde4046 --- /dev/null +++ b/backend/src/modules/pkb/services/PkbExportService.ts @@ -0,0 +1,96 @@ +import { prisma } from '../../../config/database.js'; + +/** + * PKB 数据导出服务(PKB 模块维护) + * + * ACL 防腐层出口:返回纯 DTO 对象,不暴露 Prisma 类型。 + * 消费方:ASL PkbBridgeService + */ + +export interface 
PkbDocumentExportDTO { + documentId: string; + storageKey: string; + filename: string; + extractedText: string | null; + fileSizeBytes: number; +} + +export interface PkbKnowledgeBaseExportDTO { + id: string; + name: string; + fileCount: number; +} + +class PkbExportServiceImpl { + async listKnowledgeBases(userId: string): Promise { + const kbs = await prisma.knowledgeBase.findMany({ + where: { userId }, + select: { + id: true, + name: true, + fileCount: true, + }, + orderBy: { updatedAt: 'desc' }, + }); + + return kbs.map(kb => ({ + id: kb.id, + name: kb.name, + fileCount: kb.fileCount, + })); + } + + async listPdfDocuments(kbId: string): Promise { + const docs = await prisma.document.findMany({ + where: { + kbId, + fileType: { in: ['pdf', 'application/pdf'] }, + }, + select: { + id: true, + storageKey: true, + filename: true, + extractedText: true, + fileSizeBytes: true, + }, + orderBy: { uploadedAt: 'desc' }, + }); + + return docs.map(doc => ({ + documentId: doc.id, + storageKey: doc.storageKey || '', + filename: doc.filename, + extractedText: doc.extractedText, + fileSizeBytes: Number(doc.fileSizeBytes || 0), + })); + } + + async getDocumentForExtraction(documentId: string): Promise { + const doc = await prisma.document.findUnique({ + where: { id: documentId }, + select: { + id: true, + storageKey: true, + filename: true, + extractedText: true, + fileSizeBytes: true, + }, + }); + + if (!doc) { + const err = new Error(`PKB Document not found: ${documentId}`); + (err as any).name = 'PkbDocumentNotFoundError'; + throw err; + } + + return { + documentId: doc.id, + storageKey: doc.storageKey || '', + filename: doc.filename, + extractedText: doc.extractedText, + fileSizeBytes: Number(doc.fileSizeBytes || 0), + }; + } +} + +export const pkbExportService = new PkbExportServiceImpl(); diff --git a/docs/03-业务模块/ASL-AI智能文献/00-模块当前状态与开发指南.md b/docs/03-业务模块/ASL-AI智能文献/00-模块当前状态与开发指南.md index 017f8459..eae2bd24 100644 --- a/docs/03-业务模块/ASL-AI智能文献/00-模块当前状态与开发指南.md +++ 
b/docs/03-业务模块/ASL-AI智能文献/00-模块当前状态与开发指南.md @@ -1,15 +1,14 @@ # AI智能文献模块 - 当前状态与开发指南 -> **文档版本:** v2.1 +> **文档版本:** v2.2 > **创建日期:** 2025-11-21 > **维护者:** AI智能文献开发团队 -> **最后更新:** 2026-02-24 🆕 **工具 3 V2.0 开发计划升级至 v2.0(散装派发 + Aggregator 架构,9 条研发红线)** +> **最后更新:** 2026-02-25 🆕 **工具 3 M1 骨架管线 + M2 HITL 工作台开发完成!** > **重大进展:** -> - 🆕 2026-02-24:工具 3 V2.0 架构升级!Fan-out → 散装派发 + Aggregator 轮询收口,通用模式指南 v1.1 沉淀 -> - 2026-02-23:工具 3 V2.0 开发计划 v1.5 完成!6 轮架构审查 + 5 份文档体系 -> - 🆕 2026-02-23:V2.0 核心功能完成!SSE 流式架构 + 段落化思考日志 + 引用链接可见化 -> - 🆕 2026-02-22:V2.0 前后端联调完成!瀑布流 UI + Markdown 渲染 + Word 导出 + 中文数据源测试 -> - 🆕 2026-02-22:V2.0 开发计划确认 + Unifuncs API 网站覆盖测试完成 +> - 🆕 2026-02-25:**工具 3 M1+M2 开发完成!** 散装派发+Aggregator 全链路、MinerU 集成、XML Prompt 隔离、fuzzyQuoteMatch 溯源、HITL 审核抽屉、Excel 导出、数据归一化修复 +> - 2026-02-24:工具 3 V2.0 架构升级!Fan-out → 散装派发 + Aggregator 轮询收口,通用模式指南 v1.1 沉淀 +> - 2026-02-23:V2.0 核心功能完成!SSE 流式架构 + 段落化思考日志 + 引用链接可见化 +> - 2026-02-22:V2.0 前后端联调完成!瀑布流 UI + Markdown 渲染 + Word 导出 + 中文数据源测试 > - 2026-01-18:智能文献检索(DeepSearch)MVP完成 - unifuncs API 集成 > **文档目的:** 反映模块真实状态,帮助新开发人员快速上手 @@ -33,15 +32,17 @@ AI智能文献模块是一个基于大语言模型(LLM)的文献筛选系统,用于帮助研究人员根据PICOS标准自动筛选文献。 ### 当前状态 -- **开发阶段**:🎉 V2.0 Deep Research 核心功能完成 + 🆕 工具 3 开发计划 v2.0 就绪 +- **开发阶段**:🎉 工具 3 M1+M2 开发完成,M3 待启动 - **已完成功能**: - ✅ 标题摘要初筛(Title & Abstract Screening)- 完整流程 - ✅ 全文复筛后端(Day 2-5)- LLM服务 + API + Excel导出 - ✅ **智能文献检索(DeepSearch)V1.x MVP** - unifuncs API 集成 - ✅ **Unifuncs API 网站覆盖测试** - 18 站点实测,9 个一级可用 - ✅ **🎉 Deep Research V2.0 核心功能** — SSE 流式架构 + 瀑布流 UI + HITL + Word 导出 -- **开发计划就绪(待编码)**: - - 📋 **🆕 工具 3 全文智能提取工作台 V2.0** — 开发计划 v2.0 完成(散装派发 + Aggregator 架构,9 条研发红线,M1/M2/M3 三阶段,预计 22 天) + - ✅ **🆕 工具 3 M1 骨架管线** — 散装派发+Aggregator 全链路、PKB ACL 防腐层、DeepSeek-V3 纯文本盲提、3 步极简前端 + - ✅ **🆕 工具 3 M2 HITL 工作台** — MinerU VLM 表格集成、XML Prompt 隔离、fuzzyQuoteMatch 溯源、SSE 实时日志、审核抽屉、Excel 宽表导出 +- **待开发**: + - 📋 **工具 3 M3 动态模板引擎** — 自定义字段 CRUD、Prompt 注入防护、E2E 测试 - **V2.0 已完成**: - ✅ **SSE 流式架构**:从 create_task/query_task 轮询改为 OpenAI Compatible SSE 流,实时推送 AI 
思考过程 - ✅ **LLM 需求扩写**:DeepSeek-V3 将粗略输入扩写为结构化检索指令书(PICOS + MeSH) @@ -128,11 +129,11 @@ frontend-v2/src/modules/asl/ **通用能力指南**:`docs/02-通用能力层/04-DeepResearch引擎/01-Unifuncs DeepSearch API 使用指南.md` -### 🆕 工具 3 全文智能提取工作台 V2.0(2026-02-24 开发计划 v2.0 完成,待编码) +### 🆕 工具 3 全文智能提取工作台 V2.0(2026-02-25 M1+M2 开发完成) **功能定位:** 批量读取 PDF 全文 → 动态模板驱动 AI 结构化提取 → 人工 HITL 审核 → Excel 导出。是 ASL 证据整合 V2.0 三大工具中最复杂的一个。 -**开发计划状态:** ✅ v2.0 定稿(经 8+ 轮架构审查 + 架构转型:Fan-out → 散装派发 + Aggregator) +**开发状态:** ✅ M1 骨架管线完成 ✅ M2 HITL 工作台完成 📋 M3 动态模板引擎待启动 **v2.0 架构转型(2026-02-24):** @@ -176,11 +177,36 @@ frontend-v2/src/modules/asl/ **里程碑规划:** -| 里程碑 | 核心交付 | 时间 | -|--------|---------|------| -| M1 骨架管线 | 散装派发 + Aggregator 全链路 + PKB ACL + 纯文本盲提 + 极简前端 | Week 1 | -| M2 HITL 工作台 | MinerU + 审核抽屉 + SSE 日志 + Excel | Week 2-3 | -| M3 动态模板引擎 | 自定义字段 + Prompt 注入防护 + E2E 测试 | Week 4 | +| 里程碑 | 核心交付 | 时间 | 状态 | +|--------|---------|------|------| +| M1 骨架管线 | 散装派发 + Aggregator 全链路 + PKB ACL + 纯文本盲提 + 极简前端 | Week 1 | ✅ 完成 | +| M2 HITL 工作台 | MinerU + XML Prompt 隔离 + fuzzyQuoteMatch + SSE 日志 + 审核抽屉 + Excel 导出 | Week 2-3 | ✅ 完成 | +| M3 动态模板引擎 | 自定义字段 + Prompt 注入防护 + E2E 测试 | Week 4 | 📋 待启动 | + +**M1+M2 已完成的核心代码组件:** + +| 组件 | 文件 | 说明 | +|------|------|------| +| 散装派发 API | `ExtractionService.ts` | API 层直接 insert N 个独立 Job | +| 单文档 Worker | `ExtractionSingleWorker.ts` | PdfPipeline → DynamicPrompt → LLM → Validator | +| Aggregator 轮询 | `ExtractionAggregator.ts` | 定时 groupBy 收口 + 僵尸清理 | +| PKB 防腐层 | `PkbBridgeService.ts` → `PkbExportService.ts` | ACL 解耦,DTO 传输 | +| PDF 管线 | `PdfProcessingPipeline.ts` | MinerU VLM 表格 + PKB extractedText fallback | +| XML Prompt | `DynamicPromptBuilder.ts` | 扁平 JSON 输出模板 + HIGH_FIDELITY_TABLES 优先 | +| Quote 溯源 | `ExtractionValidator.ts` | fuzzyQuoteMatch 三档置信度 | +| SSE 事件 | `ExtractionEventBus.ts` | 内存事件总线 + 历史回放 | +| Excel 导出 | `ExtractionExcelExporter.ts` | flattenModuleData 归一化 + 双行表头宽表 | +| 模板服务 | `TemplateService.ts` | 3 套系统模板(RCT/Cohort/QC) | +| 审核抽屉 | `ExtractionDrawer.tsx` | 
Schema-driven 动态字段渲染 | +| 审核工作台 | `ExtractionWorkbench.tsx` | 全宽表格 + 700px Drawer | + +**M2 关键修复记录(2026-02-25):** +- 🔧 DynamicPromptBuilder:LLM 返回数组格式 → 重写 Prompt 明确要求扁平 key-value 格式 +- 🔧 ExcelExporter:新增 `flattenModuleData` 归一化,兼容 `[{key,value,quote}]` 数组格式 +- 🔧 ExtractionDrawer:从硬编码字段改为 schema-driven 动态渲染 +- 🔧 ExtractionValidator:兼容数组格式 quote 验证 +- 🔧 SSE 路由:Fastify 插件封装隔离,SSE 端点绕过 authenticate +- 🔧 LLM JSON 清洗:sanitize 非法控制字符防止 JSON.parse 崩溃 **9 条研发红线**:详见架构总纲文档 M1 红线表。 @@ -188,6 +214,37 @@ frontend-v2/src/modules/asl/ - 🚀 `docs/02-通用能力层/散装派发与轮询收口任务模式指南.md`(v1.1,Level 2 Cookbook) - 📖 `docs/02-通用能力层/分布式Fan-out任务模式开发指南.md`(v1.2,Level 3 参考) +**工具 3 新增 API 端点:** +```http +GET /api/v1/asl/extraction/templates # 系统模板列表 +POST /api/v1/asl/extraction/templates/clone # 克隆模板到项目 +POST /api/v1/asl/extraction/tasks # 创建提取任务(散装派发) +GET /api/v1/asl/extraction/tasks/:taskId/status # 任务状态(groupBy 聚合) +GET /api/v1/asl/extraction/tasks/:taskId/results # 提取结果列表 +GET /api/v1/asl/extraction/results/:resultId # 单条结果详情(含 schema) +PUT /api/v1/asl/extraction/results/:resultId/review # 审核(approved/rejected) +GET /api/v1/asl/extraction/tasks/:taskId/stream # SSE 实时日志(无需认证) +GET /api/v1/asl/extraction/tasks/:taskId/export # Excel 导出 +GET /api/v1/asl/extraction/pkb/knowledge-bases # PKB 知识库代理 +GET /api/v1/asl/extraction/pkb/knowledge-bases/:kbId/documents # PKB 文档代理 +``` + +**工具 3 新增前端路由:** +``` +/literature/extraction/setup # Step 1: 配置与启动 +/literature/extraction/progress/:taskId # Step 2: 进度监控 +/literature/extraction/workbench/:taskId # Step 3: 审核工作台 +``` + +**工具 3 新增数据库表(asl_schema):** + +| 表名 | 说明 | +|------|------| +| `extraction_templates` | 系统内置模板(RCT/Cohort/QC) | +| `extraction_project_templates` | 项目模板(克隆自系统模板,含 outcomeType/customFields) | +| `extraction_tasks` | 提取任务(idempotencyKey 幂等) | +| `extraction_results` | 单文档提取结果(extractedData JSON + quoteVerification + reviewStatus) | + ### 智能文献检索 DeepSearch V1.x(2026-01-18 MVP完成) **功能概述:** @@ -1434,28 +1491,28 @@ Drawer打开: <50ms ## 🎯 
下一步开发计划 -### 当前(Deep Research V2.0 优化) -1. ⏳ **端到端回归测试**:完整流程测试(创建→扩写→确认→执行→结果→导出) -2. ⏳ **用户体验打磨**:加载动画、错误提示、边界情况处理 -3. ⏳ **中文检索优化**:中英文混合检索策略调优(建议分批搜索) -4. ⏳ **导出格式完善**:Word 模板美化、更多导出格式 +### 当前(工具 3 M3 动态模板引擎) +1. ⏳ **M3-1:自定义字段 CRUD**:用户可在项目模板上增删改自定义提取字段 +2. ⏳ **M3-2:Prompt 注入防护**:用户输入的字段 description 经过清洗再注入 Prompt +3. ⏳ **M3-3:E2E 完整测试**:从模板配置 → 提取 → 审核 → 导出的端到端自动化测试 +4. ⏳ **M3-4:模板版本管理**:支持锁定/解锁、版本快照 + +### 工具 3 后续优化 +1. ⏳ **RoB 自动评价增强**:Prompt 引导 LLM 基于方法学描述主动评价偏倚风险 +2. ⏳ **study_id 格式标准化**:强制 "FirstAuthor Year" 格式,后处理校验 +3. ⏳ **outcomes 模板匹配**:根据文献内容自动推荐 survival/continuous/dichotomous +4. ⏳ **缺失字段补充**:country、inclusion_criteria、primary_outcome 等(M3 自定义字段支持) ### 短期优化 -1. ⏳ Prompt 优化(需求扩写质量提升) -2. ⏳ 搜索历史管理(历史任务列表、重新搜索) -3. ⏳ 全文复筛前端 UI 开发 -4. ⏳ 标题摘要初筛 Prompt 优化(准确率 60% → 85%+) +1. ⏳ Deep Research V2.0 端到端回归测试 +2. ⏳ 搜索历史管理(历史任务列表) +3. ⏳ 标题摘要初筛 Prompt 优化(准确率 60% → 85%+) -### 中期(Month 2) -1. ⏳ 全文复筛功能完善 -2. ⏳ 证据图谱可视化 -3. ⏳ 用户自定义数据源 -4. ⏳ 生产环境部署 - -### 长期(Month 3+) -1. ⏳ 多语言检索策略自动优化 -2. ⏳ 批量文献检索 -3. ⏳ 成本控制和监控 +### 中期(Month 2-3) +1. ⏳ 工具 4(网状 Meta 分析)开发 +2. ⏳ 工具 5(证据质量评价 GRADE)开发 +3. ⏳ 生产环境部署 +4. 
⏳ 证据图谱可视化 --- @@ -1465,14 +1522,15 @@ Drawer打开: <50ms --- -**最后更新**:2026-02-23(Deep Research V2.0 核心功能完成) +**最后更新**:2026-02-25(工具 3 M1+M2 开发完成 + 数据归一化修复) **文档状态**:✅ 反映真实状态 -**下次更新时机**:V2.0 端到端回归测试完成 或 全文复筛前端开发启动 +**下次更新时机**:工具 3 M3 动态模板引擎开发完成 -**本次更新内容**(v2.0): -- ✅ 更新当前状态:V2.0 核心功能开发完成(SSE 流式 + 瀑布流 UI + Word 导出) -- ✅ 新增 V2.0 完整架构表、技术决策、API 端点、关键文件列表 -- ✅ 新增 5 个精选数据源配置(替代 9 站全量展示) -- ✅ 更新下一步开发计划(V2.0 优化 + 短期/中期/长期) +**本次更新内容**(v2.2): +- ✅ 工具 3 M1 骨架管线完成:散装派发+Aggregator、PKB ACL、纯文本盲提、3步极简前端 +- ✅ 工具 3 M2 HITL 工作台完成:MinerU 集成、XML Prompt 隔离、fuzzyQuoteMatch、SSE 日志、审核抽屉、Excel 导出 +- ✅ M2 关键修复:DynamicPromptBuilder 扁平输出、ExcelExporter 数据归一化、Schema-driven 前端 +- ✅ 新增工具 3 API 端点(12 个)、前端路由(3 个)、数据库表(4 个) +- ✅ 更新下一步计划:M3 动态模板引擎 + RoB 增强 + 后续工具 4/5 diff --git a/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 统计验证规则说明书(专家评审版).md b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 统计验证规则说明书(专家评审版).md new file mode 100644 index 00000000..69e795b1 --- /dev/null +++ b/docs/03-业务模块/RVW-稿件审查系统/06-开发记录/RVW V2.0 统计验证规则说明书(专家评审版).md @@ -0,0 +1,479 @@ +# RVW V2.0 统计验证规则说明书 + +> **文档目的:** 供统计学专家评审和确认 +> **版本:** v1.0 +> **日期:** 2026-02-18 +> **适用场景:** 医学期刊论文表格数据验证 + +--- + +## 目录 + +1. [概述](#概述) +2. [L1 算术验证规则](#l1-算术验证规则) +3. [L2 统计验证规则](#l2-统计验证规则) +4. [L2.5 一致性取证规则](#l25-一致性取证规则) +5. [容错阈值设置](#容错阈值设置) +6. 
[待评审问题清单](#待评审问题清单) + +--- + +## 概述 + +### 验证层级架构 + +| 层级 | 名称 | 目标 | 复杂度 | +|------|------|------|--------| +| **L1** | 算术验证 | 检查基础计算(加减乘除)是否正确 | 低 | +| **L2** | 统计验证 | 逆向验证统计检验结果是否合理 | 中 | +| **L2.5** | 一致性取证 | 启发式规则,发现潜在数据问题 | 高 | + +### 技术依赖 + +- **scipy.stats**:用于统计计算(t 分布、卡方分布、正态分布) +- **python-docx**:Word 文档表格提取 +- **正则表达式**:数据格式识别 + +--- + +## L1 算术验证规则 + +### 规则 L1-1:百分比计算验证 + +**应用场景:** 分类变量描述统计(如 n (%) 格式) + +**规则描述:** + +对于格式为 `n (p%)` 的单元格,验证: + +$$ +\text{calculated\_percent} = \frac{n}{N} \times 100 +$$ + +其中 N 为该列或该组的总数。 + +**判定条件:** + +| 条件 | 判定 | +|------|------| +| `|reported_percent - calculated_percent| > 0.1%` | ❌ Error | +| `|reported_percent - calculated_percent| ≤ 0.1%` | ✅ Pass | + +**示例:** + +| 原始数据 | N | 报告值 | 计算值 | 判定 | +|---------|---|--------|--------|------| +| 45 (50.0%) | 90 | 50.0% | 50.0% | ✅ Pass | +| 45 (52.0%) | 90 | 52.0% | 50.0% | ❌ Error (差 2%) | + +**识别模式(正则表达式):** + +``` +(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\) +``` + +**N 值获取策略:** + +1. 首先从表头识别标记为 "n"、"N"、"Total"、"合计" 的列 +2. 其次检查同行中是否有总数列 +3. 
最后尝试从上下文推断 + +--- + +### 规则 L1-2:合计行验证 + +**应用场景:** 表格中的 Total/Sum/合计 行 + +**规则描述:** + +对于标记为 "Total"、"Sum"、"合计"、"总计" 的行,验证该行的每个数值列是否等于上方各行之和。 + +$$ +\text{Total}_{\text{col}} = \sum_{i=1}^{n-1} \text{Value}_{i, \text{col}} +$$ + +**判定条件:** + +| 条件 | 判定 | +|------|------| +| `|reported_total - calculated_sum| > 0.5` | ❌ Error | +| `|reported_total - calculated_sum| ≤ 0.5` | ✅ Pass | + +**示例:** + +| 分组 | 人数 | 判定 | +|------|------|------| +| A 组 | 30 | - | +| B 组 | 25 | - | +| C 组 | 45 | - | +| **合计** | **100** | ✅ Pass (30+25+45=100) | +| **合计** | **110** | ❌ Error (30+25+45=100≠110) | + +**识别关键词:** + +- 英文:total, sum, all +- 中文:合计, 总计, 总和 + +--- + +## L2 统计验证规则 + +### 规则 L2-1:卡方检验 P 值逆向验证 + +**应用场景:** 分类变量组间比较(基线特征表、疗效比较表) + +**规则描述:** + +从表格中提取报告的 χ² 值,根据自由度(df)反算 P 值,与报告的 P 值对比。 + +**计算公式:** + +$$ +P_{\text{calculated}} = 1 - F_{\chi^2}(\chi^2, df) +$$ + +其中 $F_{\chi^2}$ 是卡方分布的累积分布函数。 + +**自由度估计:** + +- **默认 df = 1**(适用于大多数 2×2 比较) +- 对于 r×c 表:df = (r-1) × (c-1) + +**判定条件:** + +| 条件 | 判定 | +|------|------| +| 显著性结论相反(P<0.05 vs P≥0.05) | ❌ Error | +| `|P_reported - P_calculated| > 0.05` | ⚠️ Warning | +| `|P_reported - P_calculated| ≤ 0.05` | ✅ Pass | + +**示例:** + +| χ² 值 | df | 报告 P | 计算 P | 判定 | +|-------|-----|--------|--------|------| +| 57.52 | 1 | 0.001 | 0.0000 | ✅ Pass(均显著) | +| 3.84 | 1 | 0.05 | 0.050 | ✅ Pass | +| 2.50 | 1 | 0.01 | 0.114 | ❌ Error(显著性结论相反) | + +**识别模式(正则表达式):** + +``` +# χ² 值识别 +(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*) + +# P 值识别 +[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*) +``` + +**⚠️ 待专家确认:** + +1. df=1 作为默认值是否合理? +2. 是否需要从表格结构推断实际自由度? 
+ +--- + +### 规则 L2-2:T 检验 P 值逆向验证 + +**应用场景:** 连续变量两组比较(Mean±SD 格式) + +**规则描述:** + +从表格中提取两组的 Mean±SD 和样本量 n,使用独立样本 t 检验公式反算 P 值。 + +**计算公式:** + +$$ +t = \frac{|\bar{X}_1 - \bar{X}_2|}{\sqrt{\frac{SD_1^2}{n_1} + \frac{SD_2^2}{n_2}}} +$$ + +$$ +df = n_1 + n_2 - 2 +$$ + +$$ +P_{\text{calculated}} = 2 \times (1 - F_t(t, df)) +$$ + +其中 $F_t$ 是 t 分布的累积分布函数。 + +**判定条件:** + +| 条件 | 判定 | +|------|------| +| `|P_reported - P_calculated| > 0.05` | ❌ Error | +| `0.01 < |P_reported - P_calculated| ≤ 0.05` | ⚠️ Warning | +| `|P_reported - P_calculated| ≤ 0.01` | ✅ Pass | + +**示例:** + +| 组1 Mean±SD | 组2 Mean±SD | n1 | n2 | 报告 P | 计算 P | 判定 | +|-------------|-------------|-----|-----|--------|--------|------| +| 65.2±10.5 | 58.3±9.8 | 50 | 48 | 0.001 | 0.0007 | ✅ Pass | +| 45.0±12.0 | 44.5±11.5 | 30 | 30 | 0.001 | 0.86 | ❌ Error | + +**识别模式(正则表达式):** + +``` +# Mean±SD 格式 +(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*) + +# 带括号的 SD 格式 +(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%) +``` + +**样本量获取策略:** + +1. 从表头提取:`(n=50)`、`n=50`、`(50例)` +2. 从数据行提取:行首的 n 信息 +3. 从上下文推断 + +**⚠️ 待专家确认:** + +1. 使用 Welch's t-test(不假设方差齐性)是否更合适? +2. 当前假设为独立样本 t 检验,是否需要区分配对 t 检验? 
+ +--- + +### 规则 L2-3:CI 与 P 值逻辑一致性 + +**应用场景:** 回归分析结果表(OR、HR、RR 及其 95% CI) + +**规则描述(黄金法则):** + +95% 置信区间与 P 值之间存在严格的逻辑对应关系: + +| 95% CI 与 1.0 的关系 | P 值要求 | +|--------------------|----------| +| CI 跨越 1.0 (如 0.8-1.2) | P **必须** ≥ 0.05(不显著) | +| CI 不跨越 1.0 (如 1.1-1.5) | P **必须** < 0.05(显著) | + +**违反此规则 = 数据逻辑矛盾** + +**判定条件:** + +| 场景 | CI | P 值 | 判定 | +|------|-----|------|------| +| 矛盾 1 | 0.8-1.2(跨越 1) | 0.03(<0.05) | ❌ Error | +| 矛盾 2 | 1.2-2.5(不跨越 1) | 0.10(≥0.05) | ❌ Error | +| 正确 1 | 0.8-1.2(跨越 1) | 0.45(≥0.05) | ✅ Pass | +| 正确 2 | 1.2-2.5(不跨越 1) | 0.01(<0.05) | ✅ Pass | + +**示例:** + +| OR | 95% CI | 报告 P | 判定 | +|----|--------|--------|------| +| 1.5 | 1.2-2.0 | 0.001 | ✅ Pass(CI 不跨越 1,P<0.05) | +| 0.9 | 0.7-1.1 | 0.30 | ✅ Pass(CI 跨越 1,P≥0.05) | +| 1.3 | 0.9-1.8 | 0.02 | ❌ Error(CI 跨越 1,但 P<0.05) | + +**识别模式(正则表达式):** + +``` +# OR/HR/RR 识别 +(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*) + +# CI 识别(多种格式) +[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]] +95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*) +``` + +**⚠️ 待专家确认:** + +1. 此规则仅适用于比值指标(OR、HR、RR),对于回归系数(β)是否需要调整为 CI 跨越 0? +2. 90% CI 和 95% CI 的判定标准应如何区分? 
+ +--- + +## L2.5 一致性取证规则 + +### 规则 L2.5-1:SE 三角验证 + +**应用场景:** Logistic 回归、Cox 回归等报告 OR/HR/RR、95% CI 和 P 值的表格 + +**规则描述:** + +利用 OR/HR 与 95% CI 的数学关系,反推标准误(SE),再计算 Z 值和 P 值,与报告的 P 值对比。 + +**计算公式:** + +$$ +SE = \frac{\ln(CI_{\text{upper}}) - \ln(CI_{\text{lower}})}{3.92} +$$ + +(3.92 = 2 × 1.96,对应 95% CI 的 Z 临界值) + +$$ +Z = \frac{|\ln(OR)|}{SE} +$$ + +$$ +P_{\text{calculated}} = 2 \times (1 - \Phi(Z)) +$$ + +其中 $\Phi$ 是标准正态分布的累积分布函数。 + +**判定条件:** + +| 条件 | 判定 | +|------|------| +| `|P_reported - P_calculated| > 0.05` | ❌ Error | +| `0.01 < |P_reported - P_calculated| ≤ 0.05` | ⚠️ Warning | +| `|P_reported - P_calculated| ≤ 0.01` | ✅ Pass | + +**示例:** + +| OR | 95% CI | 报告 P | 计算 SE | 计算 Z | 计算 P | 判定 | +|----|--------|--------|---------|--------|--------|------| +| 2.0 | 1.2-3.3 | 0.008 | 0.258 | 2.69 | 0.007 | ✅ Pass | +| 1.5 | 1.0-2.25 | 0.001 | 0.206 | 1.97 | 0.049 | ❌ Error | + +**⚠️ 待专家确认:** + +1. SE 计算公式是否准确?是否需要考虑 CI 的不对称情况? +2. 对于 HR(风险比),此公式是否同样适用? + +--- + +### 规则 L2.5-2:SD > Mean 检查 + +**应用场景:** 连续变量描述统计 + +**规则描述:** + +对于**已知为正值的指标**(如年龄、体重、血压、实验室指标),标准差(SD)大于均值(Mean)通常是异常的,可能暗示: + +1. 数据录入错误 +2. SD 与 SEM 混淆 +3. 数据分布异常 + +**计算公式:** + +$$ +CV = \frac{SD}{Mean} +$$ + +若 CV > 100%(即 SD > Mean),则触发警告。 + +**判定条件:** + +| 场景 | 判定 | +|------|------| +| 已知正值指标 + SD > Mean | ❌ Error | +| 未知指标类型 + SD > Mean | ⚠️ Warning(建议核查) | + +**已知正值指标关键词:** + +| 类别 | 指标 | +|------|------| +| 人口学 | 年龄、身高、体重、BMI | +| 生命体征 | 收缩压、舒张压、心率、脉搏 | +| 血常规 | WBC、RBC、HGB、PLT | +| 生化 | 肌酐、尿素氮、血糖、ALT、AST、胆红素 | +| 其他 | 费用、时间、持续时间 | + +**例外情况:** + +- **差值指标**:如"治疗前后变化值"可正可负 +- **某些偏态分布指标**:如住院天数(可能存在极端值) + +**示例:** + +| 指标 | Mean±SD | CV | 判定 | +|------|---------|-----|------| +| 年龄 | 65±12 | 18.5% | ✅ Pass | +| 体重 | 70±15 | 21.4% | ✅ Pass | +| 年龄 | 30±45 | 150% | ❌ Error(SD>Mean) | +| 变化值 | 5±12 | 240% | ⚠️ Warning(可能合理) | + +**⚠️ 待专家确认:** + +1. CV > 100% 作为阈值是否合理? +2. 是否有其他需要排除的例外情况? 
+ +--- + +## 容错阈值设置 + +### 当前阈值配置 + +| 参数 | 值 | 说明 | +|------|-----|------| +| `PVALUE_ERROR_THRESHOLD` | 0.05 | P 值差异 > 此值 → Error | +| `PVALUE_WARNING_THRESHOLD` | 0.01 | P 值差异 > 此值 → Warning | +| `DEFAULT_TOLERANCE_PERCENT` | 0.1% | 百分比容错 ±0.1% | +| `CI_RELATIVE_TOLERANCE` | 2% | CI 端点相对误差 ±2% | +| `STAT_RELATIVE_TOLERANCE` | 5% | t/χ² 值相对误差 ±5% | + +### 阈值设置依据 + +1. **P 值阈值 0.05**:当计算的 P 值与报告的 P 值差异超过 0.05 时,可能导致显著性结论相反,属于严重问题 +2. **P 值阈值 0.01**:0.01-0.05 之间的差异可能是舍入误差或计算方法差异,给予警告 +3. **百分比容错 0.1%**:考虑四舍五入误差,允许 ±0.1% 的偏差 + +**⚠️ 待专家确认:** + +1. 上述阈值是否合理? +2. 是否需要针对不同检验方法设置不同阈值? + +--- + +## 待评审问题清单 + +### 高优先级(请专家重点关注) + +| # | 问题 | 当前处理 | 请确认 | +|---|------|----------|--------| +| 1 | 卡方检验默认 df=1 | 适用于 2×2 比较 | 是否合理?如何推断多组比较? | +| 2 | T 检验使用 Welch's 还是 Student's | 当前用合并方差公式 | 是否应默认使用 Welch's? | +| 3 | CI vs P 值规则中的 "1.0" | 仅适用于比值指标 | 回归系数应使用 0? | +| 4 | SE 三角公式的准确性 | 基于正态近似 | 对于小样本是否适用? | +| 5 | SD > Mean 的阈值 | CV > 100% 触发 | 是否过于严格? | + +### 中优先级(功能扩展) + +| # | 问题 | 说明 | +|---|------|------| +| 6 | 配对 T 检验验证 | 当前仅支持独立样本 | +| 7 | ANOVA P 值验证 | 多组比较 | +| 8 | 非参数检验验证 | Mann-Whitney、Wilcoxon | +| 9 | 相关性分析验证 | Pearson、Spearman | + +### 低优先级(边缘情况) + +| # | 问题 | 说明 | +|---|------|------| +| 10 | 90% CI vs 95% CI 区分 | 当前假设都是 95% CI | +| 11 | 单侧检验 vs 双侧检验 | 当前假设都是双侧 | +| 12 | Bonferroni 校正后的 P 值 | 多重比较场景 | + +--- + +## 附录:识别模式汇总 + +### 正则表达式清单 + +```python +# 1. 百分比格式 n (%) +PERCENT_PATTERN = r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)" + +# 2. P 值 +PVALUE_PATTERN = r"[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*)" + +# 3. 卡方值 +CHI_SQUARE_PATTERN = r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)" + +# 4. Mean±SD +MEAN_SD_PATTERN = r"(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*)" + +# 5. OR/HR/RR +EFFECT_SIZE_PATTERN = r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)" + +# 6. 
95% CI +CI_PATTERN = r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]" +``` + +--- + +**文档版本:** v1.0 +**创建日期:** 2026-02-18 +**待更新:** 专家评审反馈后更新 diff --git a/docs/03-业务模块/SSA-智能统计分析/07-统计专家配置/统计专家配置清单与准备指南.md b/docs/03-业务模块/SSA-智能统计分析/07-统计专家配置/统计专家配置清单与准备指南.md new file mode 100644 index 00000000..71f77948 --- /dev/null +++ b/docs/03-业务模块/SSA-智能统计分析/07-统计专家配置/统计专家配置清单与准备指南.md @@ -0,0 +1,586 @@ +# SSA 智能统计分析 — 统计专家配置清单与准备指南 + +> **文档版本:** v1.0 +> **创建日期:** 2026-02-23 +> **目标读者:** 统计学专家 / 生物统计师 +> **文档目的:** 系统性梳理 QPER 架构中所有需要统计学专家审核/配置的内容,便于专家提前准备 + +--- + +## 📋 目录 + +1. [系统架构概览](#1-系统架构概览) +2. [配置总览清单(一览表)](#2-配置总览清单) +3. [A 类:统计方法决策表](#3-a-类统计方法决策表) +4. [B 类:分析流程模板](#4-b-类分析流程模板) +5. [C 类:R 统计工具](#5-c-类r-统计工具) +6. [D 类:工具参数约束表](#6-d-类工具参数约束表) +7. [E 类:LLM Prompt 模板](#7-e-类llm-prompt-模板) +8. [F 类:意图识别规则](#8-f-类意图识别规则) +9. [G 类:统计工具注册表](#9-g-类统计工具注册表) +10. [当前状态与待办事项](#10-当前状态与待办事项) +11. [专家审核工作流程建议](#11-专家审核工作流程建议) + +--- + +## 1. 系统架构概览 + +### QPER 四层架构 + +系统采用 **QPER** 四层流水线架构处理用户的统计分析请求: + +``` +用户:"比较两组血压有没有差别" + │ + ▼ +┌─ Q · Query(意图理解)──────────────────────────┐ +│ LLM 解析用户意图 → 结构化查询 │ +│ 需要配置:意图识别规则(F)、意图 Prompt(E) │ +│ 输出:goal=comparison, Y=SBP, X=Group, design=ind │ +└──────────────────────┬─────────────────────────────┘ + ▼ +┌─ P · Planner(方法规划)───────────────────────────┐ +│ 决策表匹配 → 流程模板填充 │ +│ 需要配置:决策表(A)、流程模板(B)、工具注册表(G) │ +│ 输出:WorkflowPlan [step1: 描述统计, step2: T检验] │ +└──────────────────────┬─────────────────────────────┘ + ▼ +┌─ E · Execute(R引擎执行)──────────────────────────┐ +│ 调用 R 工具执行统计计算 │ +│ 需要配置:R 工具脚本(C)、参数约束(D) │ +│ 输出:StepResult[](含 P 值、统计量、效应量等) │ +└──────────────────────┬─────────────────────────────┘ + ▼ +┌─ R · Reflection(论文级结论)──────────────────────┐ +│ LLM 基于真实 R 引擎结果生成论文结论 │ +│ 需要配置:结论 Prompt(E) │ +│ 输出:ConclusionReport(含摘要、方法学、局限性) │ +└────────────────────────────────────────────────────┘ +``` + +### 核心原则 + +| 原则 | 说明 | +|------|------| +| **LLM 不做计算** | LLM 只负责意图理解、方案规划、结果解读,所有统计计算由 R 引擎完成 | +| **白盒透明** | 用户可以看到每一步做了什么、为什么、R 
代码是什么 | +| **自动降级** | 不满足参数检验前提条件时,自动切换非参数方法 | +| **可审计** | 生成可在本地运行的 R 代码,支持结果复现 | + +--- + +## 2. 配置总览清单 + +> 🟢 = 已配置完成 | 🟡 = 已配置但需专家审核/完善 | 🔴 = 未配置 + +| 类别 | 配置项 | 文件路径 | 当前状态 | 专家需做什么 | +|------|--------|----------|---------|-------------| +| **A. 决策表** | 统计方法选择规则(11 条) | `config/decision_tables.json` | 🟡 | 审核匹配规则、降级条件、补充缺失场景 | +| **B. 流程模板** | 分析步骤编排(5 个模板) | `config/flow_templates.json` | 🟡 | 审核步骤合理性、补充新模板 | +| **C. R 工具** | 统计计算脚本(12 个) | `r-statistics-service/tools/*.R` | 🟡 | 审核统计方法正确性、护栏阈值 | +| **D. 参数约束** | 变量类型/水平约束(12 工具) | `config/tool_param_constraints.json` | 🟡 | 审核约束条件(minLevels/maxLevels 等) | +| **E. Prompt** | LLM 提示词(12 个) | 数据库 + seed 脚本 | 🟡 | 审核统计术语准确性、Few-Shot 示例 | +| **F. 意图规则** | 意图识别关键词(5 类) | `config/intent_rules.json` | 🟡 | 补充统计领域关键词 | +| **G. 工具注册表** | 工具元信息(12 工具) | `config/tools_registry.json` | 🟡 | 审核工具描述、参数说明、前提条件 | + +--- + +## 3. A 类:统计方法决策表 + +**文件:** `backend/src/modules/ssa/config/decision_tables.json` + +### 作用 + +当系统理解了用户意图后(Q 层输出 goal + outcomeType + predictorType + design),通过此表进行 **四维匹配**,选出最适合的统计方法。 + +### 当前已配置的 11 条规则 + +| 规则 ID | 分析目标 | 结局类型 | 自变量类型 | 设计 | 主方法 | 降级方法 | 降级触发条件 | +|---------|---------|---------|-----------|------|--------|---------|-------------| +| `DIFF_CONT_BIN_IND` | comparison | continuous | binary | independent | T 检验 | Mann-Whitney | Shapiro-Wilk P<0.05 | +| `DIFF_CONT_BIN_PAIRED` | comparison | continuous | binary | paired | 配对 T 检验 | Wilcoxon | 差值 Shapiro-Wilk P<0.05 | +| `DIFF_CONT_MULTI_IND` | comparison | continuous | categorical | independent | ANOVA | Kruskal-Wallis | Shapiro-Wilk P<0.05(内部自动切换) | +| `DIFF_CAT_CAT_IND` | comparison | categorical | categorical | independent | 卡方检验 | Fisher | 期望频数<5 超过 20% 且 2×2 表 | +| `DIFF_CAT_CAT_SMALL` | comparison | categorical | binary | independent | Fisher | 无 | 直接使用 | +| `ASSOC_CONT_CONT` | correlation | continuous | continuous | * | 相关分析 | 无 | Pearson/Spearman 自动选择 | +| `ASSOC_CAT_ANY` | correlation | categorical | * | * | 卡方检验 | Fisher | 期望频数<5 超过 
20% | +| `PRED_BIN_ANY` | regression | binary | * | * | Logistic 回归 | 无 | — | +| `PRED_CONT_ANY` | regression | continuous | * | * | 线性回归 | 无 | — | +| `DESC_ANY` | descriptive | * | * | * | 描述性统计 | 无 | — | +| `COHORT_STUDY` | cohort_study | binary | * | * | 描述统计 | 无 | 对应队列研究模板 | + +### 🔍 专家需审核的问题 + +1. **降级条件是否正确?** + - Shapiro-Wilk P<0.05 作为正态性检验阈值是否合理? + - 期望频数<5 超过 20% 的 Fisher 切换标准是否合理? + +2. **缺失的分析场景:** + - 有序分类变量(ordinal)的比较方法? + - 多因素方差分析(多个自变量)? + - 生存分析(Kaplan-Meier、Cox 回归)? + - 重复测量设计? + - 倾向性评分匹配? + - 交叉表的多重比较? + +3. **优先级是否合理?** + - 当多条规则同时匹配时,`priority` 值高的优先,需确认排序逻辑 + +4. **队列研究的 outcomeType 限定为 binary 是否合适?** + - 如果结局变量是连续型(如生存时间),是否需要新规则? + +--- + +## 4. B 类:分析流程模板 + +**文件:** `backend/src/modules/ssa/config/flow_templates.json` + +### 作用 + +决策表选出方法后,通过流程模板确定 **执行步骤的编排顺序**。每个模板定义了"先做什么、再做什么"。 + +### 当前已配置的 5 个模板 + +#### 模板 1:`standard_analysis`(标准分析流程) +``` +步骤 1: 描述性统计(固定) +步骤 2: 主分析(动态填入,如 T 检验) +步骤 3: 敏感性分析(动态填入降级方法,条件:有降级方法时才执行) +``` +**适用:** 两组比较、多组比较、卡方检验、相关分析等 + +#### 模板 2:`paired_analysis`(配对设计分析) +``` +步骤 1: 描述性统计(固定) +步骤 2: 配对检验(动态填入,如配对 T 检验) +``` +**适用:** 前后对比、配对设计 + +#### 模板 3:`regression_analysis`(回归建模) +``` +步骤 1: 描述性统计(固定) +步骤 2: 多因素回归(动态填入,如 Logistic 或线性回归) +``` +**适用:** 多因素分析 + +#### 模板 4:`descriptive_only`(纯描述统计) +``` +步骤 1: 描述性统计(固定) +``` +**适用:** 仅需数据概况 + +#### 模板 5:`cohort_study_standard`(经典队列研究) +``` +步骤 1: 表1 — 组间基线特征比较(ST_BASELINE_TABLE,group_var=分组变量) +步骤 2: 表2 — 结局指标单因素分析(ST_BASELINE_TABLE,group_var=结局变量) +步骤 3: 表3 — 多因素 Logistic 回归(ST_LOGISTIC_BINARY,含 EPV 截断) +``` +**适用:** 队列研究 Table 1→2→3 + +### 🔍 专家需审核的问题 + +1. **标准分析是否都需要"敏感性分析"步骤?** + - 当主分析与敏感性分析结论不一致时,系统已有冲突处理准则 + +2. **队列研究模板是否完整?** + - 表1 基线比较 → 表2 单因素筛选 → 表3 多因素回归,这个流程是否标准? + - 是否需要增加:表2 → 表3 之间的变量筛选逻辑(如 P<0.1 纳入多因素)? + - 表3 的 EPV(Events Per Variable)截断规则:当前 `epv_capped_predictors` 的计算方式是否合理? + +3. **需要新增的流程模板:** + - 病例对照研究模板? + - 横断面调查模板? + - 生存分析模板(KM + Log-rank + Cox)? + - 诊断试验模板(敏感性、特异性、ROC)? + - 一致性分析模板(Kappa、ICC)? + +--- + +## 5. 
C 类:R 统计工具 + +**目录:** `r-statistics-service/tools/` + +### 作用 + +每个 R 工具实现一种统计方法的完整计算。系统通过 HTTP API 调用 R 工具,传入数据和参数,返回标准化的结果块(Block-based 输出)。 + +### 当前已实现的 12 个 R 工具 + +| 工具代码 | 方法名称 | 文件 | 内置护栏 | 自动降级 | +|---------|---------|------|---------|---------| +| `ST_DESCRIPTIVE` | 描述性统计 | `descriptive.R` | NA 安全处理 | — | +| `ST_T_TEST_IND` | 独立样本 T 检验 | `t_test_ind.R` | 正态性(Shapiro-Wilk)、方差齐性(Levene) | → Mann-Whitney | +| `ST_T_TEST_PAIRED` | 配对 T 检验 | `t_test_paired.R` | 差值正态性(Shapiro-Wilk) | → Wilcoxon | +| `ST_MANN_WHITNEY` | Mann-Whitney U 检验 | `mann_whitney.R` | 样本量检查 | — | +| `ST_WILCOXON` | Wilcoxon 符号秩检验 | `wilcoxon.R` | 配对数据完整性 | — | +| `ST_CHI_SQUARE` | 卡方检验 | `chi_square.R` | 期望频数检查 | → Fisher | +| `ST_FISHER` | Fisher 精确检验 | `fisher.R` | 2×2 表检查 | — | +| `ST_ANOVA_ONE` | 单因素方差分析 | `anova_one.R` | 正态性、方差齐性(Bartlett) | → Kruskal-Wallis | +| `ST_CORRELATION` | 相关分析 | `correlation.R` | 正态性检测 | Pearson↔Spearman 自动 | +| `ST_LOGISTIC_BINARY` | 二元 Logistic 回归 | `logistic_binary.R` | 多重共线性(VIF)、EPV 检查 | — | +| `ST_LINEAR_REG` | 线性回归 | `linear_reg.R` | 残差正态性、多重共线性(VIF) | — | +| `ST_BASELINE_TABLE` | 基线特征表 | `baseline_table.R` | 变量类型自动判断 | gtsummary 自动选方法 | + +### 每个 R 工具的标准输出结构 + +``` +{ + "success": true, + "tool_code": "ST_T_TEST_IND", + "blocks": [ + { "type": "key_value", "title": "检验结果", "items": {...} }, + { "type": "table", "title": "组间比较", "headers": [...], "rows": [...] }, + { "type": "chart", "title": "箱线图", "chartType": "boxplot", "base64": "..." }, + { "type": "text", "title": "结论", "content": "..." } + ], + "guardrail_notes": ["正态性检验通过 (Shapiro-Wilk P=0.23)"], + "reproducible_code": "# 可复现 R 代码\nlibrary(...)..." +} +``` + +### 🔍 专家需审核的问题 + +1. **护栏阈值是否合理?** + - 正态性检验:Shapiro-Wilk P<0.05 → 切换非参数,这个阈值是否标准? + - 方差齐性:Levene/Bartlett P<0.05 → 使用 Welch 校正,是否合理? + - 期望频数:<5 超过 20% → Fisher,标准是否准确? + - EPV(Events Per Variable)≥10 的门槛是否需要调整? + +2. **统计方法实现是否正确?** + - T 检验:Welch T 检验 vs Student T 检验的选择逻辑 + - ANOVA 事后比较方法:Tukey HSD vs Bonferroni? 
+ - Logistic 回归:变量选择策略(Enter 法 / Forward / Backward)? + - 相关分析:自动选择 Pearson vs Spearman 的依据 + +3. **缺失的 R 工具(需开发新工具时才需要配置):** + - Kaplan-Meier 生存分析 + - Cox 比例风险回归 + - 重复测量 ANOVA + - 混合效应模型 + - ROC 曲线与 AUC + - Kappa/ICC 一致性分析 + - 倾向性评分匹配 + +--- + +## 6. D 类:工具参数约束表 + +**文件:** `backend/src/modules/ssa/config/tool_param_constraints.json` + +### 作用 + +定义每个统计工具对输入参数的**变量类型要求**,用于: +- 前端变量选择器的智能提示和⚠️警告标记 +- 后端 API 的参数校验(防火墙) +- 帮助临床医生正确选择变量 + +### 当前已配置(12 个工具,全覆盖) + +| 工具 | 参数名 | 选择模式 | 类型要求 | 水平限制 | 提示语 | +|------|--------|---------|---------|---------|--------| +| **T 检验** | `group_var` | 单选 | categorical | maxLevels=2 | T检验要求二分类分组变量 | +| | `value_var` | 单选 | numeric | — | T检验要求连续型因变量 | +| **Mann-Whitney** | `group_var` | 单选 | categorical | maxLevels=2 | 要求二分类分组变量 | +| | `value_var` | 单选 | numeric | — | 要求连续型因变量 | +| **配对 T 检验** | `before_var` | 单选 | numeric | — | 前测变量应为连续型 | +| | `after_var` | 单选 | numeric | — | 后测变量应为连续型 | +| **Wilcoxon** | `before_var` | 单选 | numeric | — | 前测变量应为连续型 | +| | `after_var` | 单选 | numeric | — | 后测变量应为连续型 | +| **卡方检验** | `var1` | 单选 | categorical | — | 要求分类变量 | +| | `var2` | 单选 | categorical | — | 要求分类变量 | +| **Fisher** | `var1` | 单选 | categorical | — | 要求分类变量 | +| | `var2` | 单选 | categorical | — | 要求分类变量 | +| **相关分析** | `var_x` | 单选 | numeric | — | 要求连续型变量 | +| | `var_y` | 单选 | numeric | — | 要求连续型变量 | +| **Logistic 回归** | `outcome_var` | 单选 | categorical | maxLevels=2 | 要求二分类结局变量 | +| | `predictors` | 多选 | any | — | 预测变量 | +| | `confounders` | 多选 | any | — | 混杂因素(可选) | +| **线性回归** | `outcome_var` | 单选 | numeric | — | 要求连续型结局变量 | +| | `predictors` | 多选 | any | — | 预测变量 | +| | `confounders` | 多选 | any | — | 混杂因素(可选) | +| **ANOVA** | `group_var` | 单选 | categorical | minLevels=3 | 要求3组及以上分组变量 | +| | `value_var` | 单选 | numeric | — | 要求连续型因变量 | +| **基线表** | `group_var` | 单选 | categorical | minLevels=2, maxLevels=5 | 需要分类分组变量 | +| | `analyze_vars` | 多选 | any | — | 选择需要分析的变量 | +| **描述统计** | `variables` | 多选 | any | — | 选择需要描述的变量 | +| | 
`group_var` | 单选 | categorical | — | 分组变量(可选) | + +### 🔍 专家需审核的问题 + +1. **水平限制是否准确?** + - 基线表 `maxLevels=5` 是否合理?分组变量超过 5 组的情况是否存在? + - ANOVA `minLevels=3` 是否排除了两组情况的正确性?(两组应走 T 检验) + +2. **类型要求是否完整?** + - 相关分析:Spearman 也可以接受有序分类变量,是否需要放宽为 `any`? + - Logistic 回归的 `predictors`:是否应区分连续/分类? + +3. **提示语是否专业且易懂?** + - 面向临床医生,提示语是否够简明? + +--- + +## 7. E 类:LLM Prompt 模板 + +### 作用 + +Prompt 模板控制 LLM 在不同场景下的行为。所有 Prompt 存储在数据库中,通过 seed 脚本初始化。 + +### 当前已配置的 12 个 Prompt + +#### 7.1 核心角色定义 + +| Prompt Key | 用途 | 统计相关度 | 审核重点 | +|-----------|------|-----------|---------| +| `SSA_BASE_SYSTEM` | LLM 基础角色定义 | ⭐⭐⭐ | 确认"不做计算"的边界描述是否准确 | + +**当前内容要点:** +- 定义 LLM 为"分析规划者"和"结果解读者" +- 严禁生成任何数值结果(P值、均值、标准差等) +- 所有计算由 R 引擎完成 + +#### 7.2 意图分类 + +| Prompt Key | 用途 | 统计相关度 | 审核重点 | +|-----------|------|-----------|---------| +| `SSA_INTENT_ROUTER` | 意图分类器 | ⭐⭐ | 6 种意图的定义和典型示例是否准确 | +| `SSA_QUERY_INTENT` | 分析意图解析 | ⭐⭐⭐⭐⭐ | Few-Shot 示例、goal 分类、confidence 评分 | + +**`SSA_QUERY_INTENT` 审核重点:** +- **goal 分类是否完整?** 当前:comparison / correlation / regression / descriptive / cohort_study + - 是否需要增加:`survival_analysis`, `diagnostic_test`, `agreement`, `meta_analysis`? +- **Few-Shot 示例是否覆盖常见场景?** 当前 6 个示例 + - 是否需要增加:生存分析、诊断试验、一致性分析的示例? 
+- **Confidence 评分标准是否合理?** + - 0.9-1.0 = 明确指定 Y 和 X + - 0.7-0.8 = 指定 Y 但 X 需推断 + - 0.5-0.6 = 意图清楚但无变量名 + - <0.5 = 模糊表达需追问 + +#### 7.3 对话场景 + +| Prompt Key | 用途 | 统计相关度 | 审核重点 | +|-----------|------|-----------|---------| +| `SSA_INTENT_CHAT` | 自由对话 | ⭐ | — | +| `SSA_INTENT_EXPLORE` | 数据探索 | ⭐⭐ | 数据质量问题的提醒清单是否完整 | +| `SSA_INTENT_CONSULT` | 方法咨询 | ⭐⭐⭐⭐ | 推荐逻辑、前提条件说明是否准确 | +| `SSA_INTENT_ANALYZE` | 分析协调 | ⭐⭐⭐ | 方案状态说明的措辞 | +| `SSA_INTENT_DISCUSS` | 结果讨论 | ⭐⭐⭐⭐ | P 值解读指导、临床意义讨论准则 | +| `SSA_INTENT_FEEDBACK` | 改进反馈 | ⭐⭐⭐ | 诊断问题的分类框架 | + +#### 7.4 核心任务 Prompt + +| Prompt Key | 用途 | 统计相关度 | 审核重点 | +|-----------|------|-----------|---------| +| `SSA_METHOD_CONSULT` | 方法推荐输出格式 | ⭐⭐⭐⭐ | 推荐理由、前提条件、降级方案的描述准则 | +| `SSA_ANALYZE_PLAN` | 分析方案解释 | ⭐⭐⭐ | 方案解释的非技术化表达准则 | +| `SSA_PICO_INFERENCE` | PICO 结构推断 | ⭐⭐⭐⭐⭐ | Few-Shot 示例、观察性研究处理、Confidence 准则 | +| `SSA_REFLECTION` | 论文级结论生成 | ⭐⭐⭐⭐⭐ | 冲突处理准则、方法学说明格式、局限性模板 | + +### 🔍 专家需审核的关键 Prompt + +**优先级 1(最重要):** + +1. **`SSA_QUERY_INTENT`** — 决定了系统如何理解用户需求 + - Few-Shot 示例是否覆盖临床常见场景? + - 队列研究 vs 横断面调查的区分规则 + - "统计学意义"这类表达的意图分类是否正确 + +2. **`SSA_REFLECTION`** — 决定了最终报告的质量 + - 冲突处理准则(主分析与敏感性分析不一致时) + - 方法学段落的撰写指导 + - 局限性的标准表述 + +3. **`SSA_PICO_INFERENCE`** — 决定了数据理解的准确性 + - 观察性研究 intervention=null 的判断准则 + - 变量名引用规则 + - Confidence 评分准则 + +**优先级 2:** + +4. **`SSA_INTENT_CONSULT`** — 方法咨询的准确性 +5. **`SSA_INTENT_DISCUSS`** — P 值和效应量的解读指导 +6. **`SSA_METHOD_CONSULT`** — 推荐方法的输出格式 + +--- + +## 8. 
F 类:意图识别规则 + +**文件:** `backend/src/modules/ssa/config/intent_rules.json` + +### 作用 + +系统首先用 **关键词规则引擎**(零延迟)判断用户意图。规则无法确定时,才使用 LLM 兜底。 + +### 当前已配置的 5 类意图规则 + +| 意图 | 关键词 | 排除词 | 前置条件 | 优先级 | +|------|--------|--------|---------|--------| +| **analyze** | 分析、检验、t检验、卡方、回归、比较一下、跑一下、执行分析、做个分析、方差分析、ANOVA、相关分析、logistic、生存分析、Cox、基线表 | 什么方法、用什么、应该怎么、推荐 | 有数据 | 10 | +| **discuss** | 什么意思、说明什么、怎么解释、p值、置信区间、结果说明、为什么显著、为什么不显著、临床意义、效应量 | — | 有数据+有结果 | 9 | +| **feedback** | 结果不对、不太对、换个方法、重新分析、有问题、不满意、重做 | — | 有数据+有结果 | 9 | +| **explore** | 看看、分布、缺失、概况、有哪些变量、数据特征、异常值、样本量、描述一下数据、多少例、变量类型 | — | 有数据 | 8 | +| **consult** | 什么方法、用什么、应该怎么分析、推荐方法、分析方案、哪种检验、怎么选、前提条件 | — | 有数据 | 7 | + +### 🔍 专家需审核的问题 + +1. **关键词是否完整?** + - analyze 缺少:生存分析相关(生存曲线、KM、hazard ratio)、诊断试验相关(ROC、AUC、敏感性) + - consult 缺少:样本量估计、效能分析相关 + - discuss 缺少:多重比较校正相关(Bonferroni、FDR) + +2. **排除词是否合理?** + - 当前 analyze 排除"什么方法、用什么"以避免误判为 consult,但用户说"用 T 检验分析"是否会被误排除? + +3. **默认意图 `chat` 是否是最佳兜底?** + +--- + +## 9. G 类:统计工具注册表 + +**文件:** `backend/src/modules/ssa/config/tools_registry.json` + +### 作用 + +注册所有可用的统计工具元信息,包括工具代码、参数列表、输出类型、前提条件和降级方法。 + +### 当前注册的 12 个工具 + +| 工具代码 | 工具名称 | 分类 | 前提条件 | 降级方法 | +|---------|---------|------|---------|---------| +| `ST_DESCRIPTIVE` | 描述性统计 | basic | — | — | +| `ST_T_TEST_IND` | 独立样本T检验 | parametric | 正态分布 | ST_MANN_WHITNEY | +| `ST_MANN_WHITNEY` | Mann-Whitney U检验 | nonparametric | — | — | +| `ST_T_TEST_PAIRED` | 配对T检验 | parametric | — | — | +| `ST_CHI_SQUARE` | 卡方检验 | categorical | — | ST_FISHER | +| `ST_CORRELATION` | 相关分析 | correlation | — | — | +| `ST_LOGISTIC_BINARY` | 二元Logistic回归 | regression | — | — | +| `ST_FISHER` | Fisher精确检验 | categorical | — | — | +| `ST_ANOVA_ONE` | 单因素方差分析 | parametric | 正态分布+方差齐性 | Kruskal-Wallis | +| `ST_WILCOXON` | Wilcoxon符号秩检验 | nonparametric | — | — | +| `ST_LINEAR_REG` | 线性回归 | regression | — | — | +| `ST_BASELINE_TABLE` | 基线特征表 | composite | — | — | + +### 🔍 专家需审核的问题 + +1. 
**前提条件字段(prerequisite)需要补全** + - 多数工具未填写前提条件,但实际 R 代码中有检查 + - 建议统一补全,用于向用户展示 + +2. **category 分类是否需要调整?** + - 当前:basic / parametric / nonparametric / categorical / correlation / regression / composite + - 是否需要更精细的分类? + +3. **配对 T 检验缺少 fallback 声明** + - `ST_T_TEST_PAIRED` 的 fallback 应为 `ST_WILCOXON` + +--- + +## 10. 当前状态与待办事项 + +### ✅ 已完成(系统可运行) + +| 项目 | 说明 | +|------|------| +| 12 个 R 工具全部实现 | 含内置护栏和自动降级 | +| 11 条决策表规则 | 覆盖常见分析场景 | +| 5 个流程模板 | 含队列研究全套模板 | +| 12 个 Prompt 入库 | 含角色定义、意图分类、结论生成 | +| 参数约束表全覆盖 | 12 工具 27 个参数 | +| 变量可编辑化 | 医生可修改系统默认选择 | + +### 🟡 需要专家审核/完善 + +| 优先级 | 项目 | 预估工作量 | +|--------|------|-----------| +| P0 | 决策表降级条件审核 | 1-2 小时 | +| P0 | R 工具护栏阈值审核 | 2-3 小时 | +| P0 | `SSA_QUERY_INTENT` Prompt Few-Shot 审核 | 1 小时 | +| P0 | `SSA_REFLECTION` 冲突处理准则审核 | 1 小时 | +| P1 | 参数约束表水平限制审核 | 30 分钟 | +| P1 | 意图识别关键词补充 | 30 分钟 | +| P1 | 队列研究模板步骤审核 | 1 小时 | +| P2 | 工具注册表前提条件补全 | 30 分钟 | +| P2 | PICO 推断 Prompt 示例审核 | 30 分钟 | + +### 🔴 未来扩展(新统计方法) + +| 统计方法 | 涉及配置项 | 说明 | +|---------|-----------|------| +| **生存分析** | 新 R 工具 + 决策表 + 流程模板 + Prompt 示例 | Kaplan-Meier + Cox | +| **诊断试验** | 新 R 工具 + 决策表 + 流程模板 | ROC/AUC/敏感性/特异性 | +| **一致性分析** | 新 R 工具 + 决策表 | Kappa / ICC | +| **倾向性评分** | 新 R 工具 + 流程模板 | PSM + 匹配后分析 | +| **混合效应模型** | 新 R 工具 + 决策表 | 重复测量/多水平 | +| **样本量估计** | 新 R 工具 + 意图规则 | 效能分析 | +| **Meta 分析** | 新 R 工具 + 完整子系统 | 固定/随机效应 | + +--- + +## 11. 专家审核工作流程建议 + +### 推荐审核顺序 + +``` +第一轮:理解架构(30 分钟) +├── 阅读本文档第 1 节(系统架构概览) +└── 理解 QPER 四层流水线和配置关系 + +第二轮:审核统计核心(3-4 小时) +├── A. 决策表 → 确认方法选择规则和降级条件 +├── C. R 工具 → 审核护栏阈值和统计实现 +├── D. 参数约束 → 确认变量类型要求 +└── B. 流程模板 → 审核步骤编排逻辑 + +第三轮:审核 LLM 指导(2-3 小时) +├── E. SSA_QUERY_INTENT → 审核 Few-Shot 和 goal 分类 +├── E. SSA_REFLECTION → 审核结论生成准则 +├── E. SSA_PICO_INFERENCE → 审核研究设计推断 +└── F. 
意图规则 → 补充统计领域关键词 + +第四轮:规划扩展(1-2 小时) +├── 确定优先需要新增的统计方法 +├── 为新方法准备决策表规则 +└── 为新方法准备 Few-Shot 示例 +``` + +### 配置文件位置速查 + +| 配置文件 | 路径 | +|---------|------| +| 决策表 | `backend/src/modules/ssa/config/decision_tables.json` | +| 流程模板 | `backend/src/modules/ssa/config/flow_templates.json` | +| 工具注册表 | `backend/src/modules/ssa/config/tools_registry.json` | +| 参数约束 | `backend/src/modules/ssa/config/tool_param_constraints.json` | +| 意图规则 | `backend/src/modules/ssa/config/intent_rules.json` | +| Prompt 种子 | `backend/scripts/seed-ssa-*.ts` | +| R 工具 | `r-statistics-service/tools/*.R` | + +### 反馈格式建议 + +专家审核后,建议按以下格式反馈: + +```markdown +## 审核反馈 — [配置项名称] + +### 确认正确的部分 +- [列出审核通过的规则/配置] + +### 需要修改的部分 +- [配置项]: [当前值] → [建议值] +- [原因说明] + +### 需要新增的部分 +- [新规则/新工具/新示例] +- [适用场景说明] + +### 需要讨论的问题 +- [有争议或需要权衡的决策] +``` + +--- + +**文档版本:** v1.0 +**创建日期:** 2026-02-23 +**维护者:** 开发团队 +**下次更新触发条件:** 专家审核反馈后 / 新增统计工具时 diff --git a/docs/06-测试文档/extraction-results-ef8ef651-ecd9-45a1-923f-2aac09c66031.xlsx b/docs/06-测试文档/extraction-results-ef8ef651-ecd9-45a1-923f-2aac09c66031.xlsx new file mode 100644 index 00000000..e6a164db Binary files /dev/null and b/docs/06-测试文档/extraction-results-ef8ef651-ecd9-45a1-923f-2aac09c66031.xlsx differ diff --git a/frontend-v2/src/modules/asl/api/index.ts b/frontend-v2/src/modules/asl/api/index.ts index f0b8cba9..709f65b1 100644 --- a/frontend-v2/src/modules/asl/api/index.ts +++ b/frontend-v2/src/modules/asl/api/index.ts @@ -528,6 +528,97 @@ export async function getDeepResearchTask( return request(`/research/tasks/${taskId}`); } +// ==================== 工具 3:全文智能提取 API ==================== + +export async function getExtractionTemplates(): Promise> { + return request('/extraction/templates'); +} + +export async function getExtractionTemplate(templateId: string): Promise> { + return request(`/extraction/templates/${templateId}`); +} + +export async function cloneExtractionTemplate( + projectId: string, + baseTemplateId: string +): Promise> { + return 
request('/extraction/templates/clone', { + method: 'POST', + body: JSON.stringify({ projectId, baseTemplateId }), + }); +} + +export async function getExtractionKnowledgeBases(): Promise> { + return request('/extraction/knowledge-bases'); +} + +export async function getExtractionDocuments(kbId: string): Promise> { + return request(`/extraction/knowledge-bases/${kbId}/documents`); +} + +export async function createExtractionTask(params: { + projectId: string; + projectTemplateId: string; + pkbKnowledgeBaseId: string; + documentIds: string[]; + idempotencyKey?: string; +}): Promise> { + return request('/extraction/tasks', { + method: 'POST', + body: JSON.stringify(params), + }); +} + +export async function getExtractionTaskStatus( + taskId: string +): Promise> { + return request(`/extraction/tasks/${taskId}`); +} + +export async function getExtractionTaskResults( + taskId: string +): Promise> { + return request(`/extraction/tasks/${taskId}/results`); +} + +export async function getExtractionResultDetail( + resultId: string +): Promise> { + return request(`/extraction/results/${resultId}`); +} + +export async function reviewExtractionResult( + resultId: string, + data: { reviewStatus: 'approved' | 'rejected' } +): Promise> { + return request(`/extraction/results/${resultId}/review`, { + method: 'PUT', + body: JSON.stringify(data), + }); +} + +export async function exportExtractionResults( + taskId: string +): Promise { + const response = await fetch( + `${API_BASE_URL}/extraction/tasks/${taskId}/export`, + { headers: getAuthHeaders() } + ); + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + return response.blob(); +} + // ==================== 统一导出API对象 ==================== /** @@ -584,4 +675,17 @@ export const aslApi = { generateRequirement, executeDeepResearchTask, getDeepResearchTask, + + // 工具 3:全文智能提取 + getExtractionTemplates, + getExtractionTemplate, + cloneExtractionTemplate, + getExtractionKnowledgeBases, + getExtractionDocuments, + 
createExtractionTask, + getExtractionTaskStatus, + getExtractionTaskResults, + getExtractionResultDetail, + reviewExtractionResult, + exportExtractionResults, }; diff --git a/frontend-v2/src/modules/asl/components/ASLLayout.tsx b/frontend-v2/src/modules/asl/components/ASLLayout.tsx index ec9956bc..2ad70270 100644 --- a/frontend-v2/src/modules/asl/components/ASLLayout.tsx +++ b/frontend-v2/src/modules/asl/components/ASLLayout.tsx @@ -95,11 +95,21 @@ const ASLLayout = () => { ], }, { - key: 'data-extraction', + key: 'extraction', icon: , - label: '6. 全文解析与数据提取', - disabled: true, - title: '敬请期待' + label: '6. 全文智能提取', + children: [ + { + key: '/literature/extraction/setup', + icon: , + label: '配置与启动', + }, + { + key: '/literature/extraction/workbench', + icon: , + label: '审核工作台', + }, + ], }, { key: 'data-analysis', @@ -125,6 +135,7 @@ const ASLLayout = () => { const getOpenKeys = () => { if (currentPath.includes('screening/title')) return ['title-screening']; if (currentPath.includes('screening/fulltext')) return ['fulltext-screening']; + if (currentPath.includes('/extraction')) return ['extraction']; return []; }; const openKeys = getOpenKeys(); diff --git a/frontend-v2/src/modules/asl/components/extraction/ExtractionDrawer.tsx b/frontend-v2/src/modules/asl/components/extraction/ExtractionDrawer.tsx new file mode 100644 index 00000000..c34e058f --- /dev/null +++ b/frontend-v2/src/modules/asl/components/extraction/ExtractionDrawer.tsx @@ -0,0 +1,302 @@ +/** + * 智能审核抽屉 — 700px 右侧 Drawer + * 动态读取后端返回的 schema(模板字段定义)来构建审核面板 + * 每个字段下方附带 QuoteBlock 溯源展示 + * 底部 Footer: [取消] + [核准保存] + */ + +import { useState, useEffect, useMemo } from 'react'; +import { Drawer, Collapse, Button, Space, Spin, Typography, message } from 'antd'; +import { + CheckCircleOutlined, + FileTextOutlined, + UserOutlined, + SafetyCertificateOutlined, + BarChartOutlined, + FilePdfOutlined, +} from '@ant-design/icons'; +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; 
+import { aslApi } from '../../api'; +import ExtractionStatusBadge from './ExtractionStatusBadge'; +import FieldGroup from './FieldGroup'; + +const { Text } = Typography; + +const MODULE_ICONS: Record = { + metadata: , + baseline: , + rob: , + outcomes: , +}; + +const MODULE_LABELS: Record = { + metadata: '基础元数据', + baseline: '基线特征', + rob: '偏倚风险评估', + outcomes_survival: '结局指标(生存)', + outcomes_dichotomous: '结局指标(二分类)', + outcomes_continuous: '结局指标(连续)', +}; + +/** + * 将 LLM 返回的模块数据归一化为扁平 key-value 映射 + * 兼容: [{key, value, quote}] 数组 | {field: value, field_quote: "..."} | {field: {value, quote}} + */ +function flattenModuleData(moduleData: any): Record { + if (!moduleData) return {}; + if (Array.isArray(moduleData)) { + const flat: Record = {}; + for (const item of moduleData) { + if (typeof item === 'object' && item !== null && 'key' in item) { + flat[item.key] = item.value ?? null; + if (item.quote) flat[`${item.key}_quote`] = item.quote; + } + } + return flat; + } + if (typeof moduleData === 'object') { + const flat: Record = {}; + for (const [k, v] of Object.entries(moduleData)) { + if (typeof v === 'object' && v !== null && !Array.isArray(v) && 'value' in (v as any)) { + flat[k] = (v as any).value ?? 
null; + if ((v as any).quote) flat[`${k}_quote`] = (v as any).quote; + } else { + flat[k] = v; + } + } + return flat; + } + return {}; +} + +function getStudyId(data: any): string { + if (!data) return ''; + const meta = data.metadata; + if (!meta) return ''; + const flat = flattenModuleData(meta); + const v = flat.study_id; + if (v === null || v === undefined) return ''; + return String(v); +} + +interface Props { + open: boolean; + resultId: string | null; + taskId: string; + onClose: () => void; + onSaved: () => void; +} + +const ExtractionDrawer: React.FC = ({ open, resultId, taskId: _taskId, onClose, onSaved }) => { + const queryClient = useQueryClient(); + const [activeKeys, setActiveKeys] = useState(['metadata']); + + const { data: resultResp, isLoading } = useQuery({ + queryKey: ['extraction-result-detail', resultId], + queryFn: () => aslApi.getExtractionResultDetail(resultId!), + enabled: open && !!resultId, + }); + const result = resultResp?.data; + + useEffect(() => { + if (open) setActiveKeys(['metadata']); + }, [open, resultId]); + + const reviewMutation = useMutation({ + mutationFn: (status: 'approved' | 'rejected') => + aslApi.reviewExtractionResult(resultId!, { reviewStatus: status }), + onSuccess: () => { + message.success('审核已保存'); + queryClient.invalidateQueries({ queryKey: ['extraction-result-detail', resultId] }); + onSaved(); + }, + onError: (err: any) => { + message.error(err.message || '审核保存失败'); + }, + }); + + const collapseItems = useMemo(() => { + if (!result?.extractedData) return []; + const data = result.extractedData as Record; + const quoteVerification = (result.quoteVerification || {}) as Record; + const schema = result.schema as Record | undefined; + + // Determine module list from schema or fallback to data keys + const moduleKeys = schema + ? 
Object.keys(schema) + : Object.keys(data); + + return moduleKeys.map((modKey) => { + const schemaFields = schema?.[modKey] as Array<{ key: string; label?: string; type?: string }> | undefined; + + // Find matching data (handle outcomes_* prefix matching) + const rawModule = data[modKey] + || (modKey.startsWith('outcomes') ? Object.entries(data).find(([k]) => k.startsWith('outcomes_'))?.[1] : null); + const flat = flattenModuleData(rawModule); + + const rawQuotes = quoteVerification[modKey] + || (modKey.startsWith('outcomes') ? Object.entries(quoteVerification).find(([k]) => k.startsWith('outcomes_'))?.[1] : null); + const flatQuotes = flattenModuleData(rawQuotes); + + // Build field list from schema (ordered) + extra keys from data + const seenKeys = new Set(); + const fields: Array<{ key: string; label: string; value: any; quoteVerification?: any }> = []; + + if (schemaFields) { + for (const sf of schemaFields) { + if (!sf.key) continue; + seenKeys.add(sf.key); + const val = flat[sf.key]; + if (val === undefined || val === null) continue; + fields.push({ + key: sf.key, + label: sf.label || sf.key.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()), + value: val, + quoteVerification: flatQuotes[sf.key] + || (flat[`${sf.key}_quote`] ? { confidence: 'medium' as const, quote: flat[`${sf.key}_quote`] } : undefined), + }); + } + } + + // Append extra fields not in schema + for (const k of Object.keys(flat)) { + if (k.endsWith('_quote') || seenKeys.has(k)) continue; + seenKeys.add(k); + fields.push({ + key: k, + label: k.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()), + value: flat[k], + quoteVerification: flatQuotes[k] + || (flat[`${k}_quote`] ? { confidence: 'medium' as const, quote: flat[`${k}_quote`] } : undefined), + }); + } + + const baseModKey = modKey.startsWith('outcomes_') ? 
'outcomes' : modKey; + const icon = MODULE_ICONS[baseModKey] || ; + const defaultLabel = MODULE_LABELS[modKey] || modKey.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase()); + + return { + key: modKey, + label: ( + + {icon} + {defaultLabel} + ({fields.length} 字段) + + ), + children: , + }; + }); + }, [result]); + + const isApproved = result?.reviewStatus === 'approved'; + const studyId = result ? getStudyId(result.extractedData) : ''; + + return ( + + + + + {studyId || result.snapshotFilename} + + + + ) : ( + '加载中...' + ) + } + styles={{ wrapper: { width: 700 } }} + open={open} + onClose={onClose} + destroyOnClose + footer={ + result && ( +
+ + + {!isApproved && ( + + )} + + +
+ ) + } + > + {isLoading || !result ? ( +
+ +
+ ) : ( + <> + {result.quoteVerification && ( +
+ + 原文溯源覆盖率 + {(() => { + const qv = result.quoteVerification || {}; + let high = 0, medium = 0, low = 0, total = 0; + Object.values(qv).forEach((mod: any) => { + if (typeof mod === 'object' && mod !== null) { + Object.values(mod).forEach((field: any) => { + if (field && typeof field === 'object' && 'confidence' in field) { + total++; + if (field.confidence === 'high') high++; + else if (field.confidence === 'medium') medium++; + else low++; + } + }); + } + }); + return ( + + {high} 高 + {medium} 中 + {low} 低 + / {total} 字段 + + ); + })()} + + +
+ )} + + setActiveKeys(keys as string[])} + items={collapseItems} + /> + + )} +
+ ); +}; + +export default ExtractionDrawer; diff --git a/frontend-v2/src/modules/asl/components/extraction/ExtractionStatusBadge.tsx b/frontend-v2/src/modules/asl/components/extraction/ExtractionStatusBadge.tsx new file mode 100644 index 00000000..25116b74 --- /dev/null +++ b/frontend-v2/src/modules/asl/components/extraction/ExtractionStatusBadge.tsx @@ -0,0 +1,28 @@ +/** + * 提取审核状态标签组件 + * pending_review / approved / rejected + */ + +import { Tag } from 'antd'; +import { CheckCircleOutlined, ClockCircleOutlined, CloseCircleOutlined } from '@ant-design/icons'; + +interface Props { + status: string; +} + +const statusMap: Record = { + pending_review: { color: 'orange', text: '待审核', icon: }, + approved: { color: 'green', text: '已核准', icon: }, + rejected: { color: 'red', text: '已驳回', icon: }, +}; + +const ExtractionStatusBadge: React.FC = ({ status }) => { + const { color, text, icon } = statusMap[status] || statusMap.pending_review; + return ( + + {text} + + ); +}; + +export default ExtractionStatusBadge; diff --git a/frontend-v2/src/modules/asl/components/extraction/FieldGroup.tsx b/frontend-v2/src/modules/asl/components/extraction/FieldGroup.tsx new file mode 100644 index 00000000..70cc75de --- /dev/null +++ b/frontend-v2/src/modules/asl/components/extraction/FieldGroup.tsx @@ -0,0 +1,67 @@ +/** + * 字段组渲染 — 用于审核抽屉内的 Collapse Panel 内容 + * 将 extractedData 中一个模块的字段渲染为 label / value / quote 三行结构 + */ + +import React from 'react'; +import { Descriptions, Typography, Empty } from 'antd'; +import QuoteBlock from './QuoteBlock'; + +const { Text } = Typography; + +interface FieldItem { + key: string; + label: string; + value: any; + quoteVerification?: { + confidence: 'high' | 'medium' | 'low'; + quote?: string; + matchScore?: number; + }; +} + +interface Props { + fields: FieldItem[]; + readOnly?: boolean; +} + +function formatValue(val: any): string { + if (val === null || val === undefined) return '-'; + if (typeof val === 'object' && val !== null && 'value' in val) 
return formatValue(val.value); + if (Array.isArray(val)) return val.map(formatValue).join(', '); + if (typeof val === 'object') return JSON.stringify(val, null, 2); + return String(val); +} + +const FieldGroup: React.FC = ({ fields, readOnly = false }) => { + if (!fields || fields.length === 0) { + return ; + } + + return ( + + {fields.map((f) => ( + +
+ {formatValue(f.value)} + {f.quoteVerification && ( + + )} +
+
+ ))} +
+ ); +}; + +export default React.memo(FieldGroup); diff --git a/frontend-v2/src/modules/asl/components/extraction/ProcessingTerminal.tsx b/frontend-v2/src/modules/asl/components/extraction/ProcessingTerminal.tsx new file mode 100644 index 00000000..bc1f28e2 --- /dev/null +++ b/frontend-v2/src/modules/asl/components/extraction/ProcessingTerminal.tsx @@ -0,0 +1,103 @@ +/** + * 深色终端日志组件 — 用于 Step 2 提取进度页 + * 使用 SSE 推送实时日志,优雅降级为 "暂无日志" 提示 + * 颜色方案: [MinerU] 蓝色 / [DeepSeek] 紫色 / [System] 绿色 + */ + +import { useEffect, useRef, useState } from 'react'; +import { Card } from 'antd'; +import { CodeOutlined } from '@ant-design/icons'; +import useExtractionLogs from '../../hooks/useExtractionLogs'; + +interface Props { + taskId: string; +} + +interface LogEntry { + timestamp: string; + source: string; + message: string; + level: 'info' | 'warn' | 'error'; +} + +const sourceColorMap: Record = { + MinerU: '#58a6ff', + DeepSeek: '#bc8cff', + System: '#7ee787', + Aggregator: '#ffa657', + Worker: '#79c0ff', +}; + +function getSourceColor(source: string): string { + return sourceColorMap[source] || '#8b949e'; +} + +function getLevelColor(level: string): string { + if (level === 'error') return '#f85149'; + if (level === 'warn') return '#d29922'; + return '#c9d1d9'; +} + +const ProcessingTerminal: React.FC = ({ taskId }) => { + const containerRef = useRef(null); + const { logs, connected } = useExtractionLogs(taskId); + const [autoScroll, setAutoScroll] = useState(true); + + useEffect(() => { + if (autoScroll && containerRef.current) { + containerRef.current.scrollTop = containerRef.current.scrollHeight; + } + }, [logs, autoScroll]); + + const handleScroll = () => { + if (!containerRef.current) return; + const { scrollTop, scrollHeight, clientHeight } = containerRef.current; + setAutoScroll(scrollHeight - scrollTop - clientHeight < 40); + }; + + return ( + + + 处理日志 + {connected && ( + + )} + + } + className="border-gray-700" + styles={{ + header: { backgroundColor: '#1e1e2e', 
borderBottom: '1px solid #333' }, + body: { padding: 0 }, + }} + > +
+ {logs.length === 0 ? ( +
+ {connected ? '等待日志数据...' : '日志流未连接,进度数据由轮询驱动'} +
+ ) : ( + logs.map((log: LogEntry, i: number) => ( +
+ {log.timestamp} + [{log.source}] + {log.message} +
+ )) + )} +
+
+ ); +}; + +export default ProcessingTerminal; diff --git a/frontend-v2/src/modules/asl/components/extraction/QuoteBlock.tsx b/frontend-v2/src/modules/asl/components/extraction/QuoteBlock.tsx new file mode 100644 index 00000000..d91d0f25 --- /dev/null +++ b/frontend-v2/src/modules/asl/components/extraction/QuoteBlock.tsx @@ -0,0 +1,137 @@ +/** + * AI 原文溯源展示块 + * - 灰色背景 + 关键数字黄色 mark 高亮 + * - 三级置信度 Badge(green / yellow / red) + * - 红色警告时显示 [强制认可] + [手动修改数值] 按钮 + */ + +import React, { useState } from 'react'; +import { Tag, Button, Input, Space, Typography, Tooltip } from 'antd'; +import { CheckOutlined, EditOutlined, WarningOutlined } from '@ant-design/icons'; + +const { Text } = Typography; + +interface QuoteVerification { + confidence: 'high' | 'medium' | 'low'; + quote?: string; + matchScore?: number; +} + +interface Props { + value: any; + quoteVerification?: QuoteVerification; + onForceAccept?: () => void; + onManualEdit?: (newValue: string) => void; + readOnly?: boolean; +} + +const confidenceMap = { + high: { color: 'green' as const, text: '高置信度', tooltip: '原文精确匹配' }, + medium: { color: 'orange' as const, text: '中置信度', tooltip: '原文模糊匹配,建议核查' }, + low: { color: 'red' as const, text: '低置信度', tooltip: '未在原文中找到匹配,需人工校验' }, +}; + +function highlightNumbers(text: string): React.ReactNode[] { + const parts = text.split(/(\d+\.?\d*%?)/g); + return parts.map((part, i) => + /^\d+\.?\d*%?$/.test(part) ? ( + + {part} + + ) : ( + {part} + ) + ); +} + +const QuoteBlock: React.FC = ({ + value, + quoteVerification, + onForceAccept, + onManualEdit, + readOnly = false, +}) => { + const [editing, setEditing] = useState(false); + const [editValue, setEditValue] = useState(String(value ?? '')); + + if (!quoteVerification?.quote) return null; + + const { confidence, quote, matchScore } = quoteVerification; + const conf = confidenceMap[confidence] || confidenceMap.medium; + + return ( +
+
+ + AI 原文溯源 + + + {conf.text} + {matchScore !== undefined && ` (${(matchScore * 100).toFixed(0)}%)`} + + + +
+ +
+ “{highlightNumbers(quote)}” +
+ + {confidence === 'low' && !readOnly && ( +
+ {onForceAccept && ( + + )} + {onManualEdit && !editing && ( + + )} + {editing && ( + + setEditValue(e.target.value)} + style={{ width: 200 }} + /> + + + )} +
+ )} + + {confidence === 'low' && ( +
+ + 该数据未在原文中找到匹配,请仔细核查 +
+ )} +
+ ); +}; + +export default React.memo(QuoteBlock); diff --git a/frontend-v2/src/modules/asl/hooks/useExtractionLogs.ts b/frontend-v2/src/modules/asl/hooks/useExtractionLogs.ts new file mode 100644 index 00000000..7fe29f25 --- /dev/null +++ b/frontend-v2/src/modules/asl/hooks/useExtractionLogs.ts @@ -0,0 +1,96 @@ +/** + * SSE 连接管理 Hook — 用于 ProcessingTerminal + * 连接 GET /api/v1/asl/extraction/tasks/:taskId/stream + * 优雅降级:连接失败时 connected=false,前端仅依赖 React Query 轮询 + */ + +import { useState, useEffect, useRef, useCallback } from 'react'; +import { getAccessToken } from '../../../framework/auth/api'; + +interface LogEntry { + timestamp: string; + source: string; + message: string; + level: 'info' | 'warn' | 'error'; +} + +interface UseExtractionLogsResult { + logs: LogEntry[]; + connected: boolean; +} + +const MAX_LOGS = 500; + +export default function useExtractionLogs(taskId: string): UseExtractionLogsResult { + const [logs, setLogs] = useState([]); + const [connected, setConnected] = useState(false); + const eventSourceRef = useRef(null); + const retryCountRef = useRef(0); + const maxRetries = 3; + + const addLog = useCallback((entry: LogEntry) => { + setLogs((prev) => { + const next = [...prev, entry]; + return next.length > MAX_LOGS ? next.slice(next.length - MAX_LOGS) : next; + }); + }, []); + + useEffect(() => { + if (!taskId) return; + + const token = getAccessToken(); + const url = `/api/v1/asl/extraction/tasks/${taskId}/stream${token ? 
`?token=${token}` : ''}`; + + function connect() { + const es = new EventSource(url); + eventSourceRef.current = es; + + es.onopen = () => { + setConnected(true); + retryCountRef.current = 0; + }; + + es.addEventListener('sync', (e: MessageEvent) => { + try { + const data = JSON.parse(e.data); + if (Array.isArray(data.logs)) { + setLogs(data.logs.slice(-MAX_LOGS)); + } + } catch { /* ignore */ } + }); + + es.addEventListener('log', (e: MessageEvent) => { + try { + const entry = JSON.parse(e.data) as LogEntry; + addLog(entry); + } catch { /* ignore */ } + }); + + es.addEventListener('error', (e: MessageEvent) => { + try { + const entry = JSON.parse(e.data) as LogEntry; + addLog({ ...entry, level: 'error' }); + } catch { /* ignore */ } + }); + + es.onerror = () => { + es.close(); + setConnected(false); + if (retryCountRef.current < maxRetries) { + retryCountRef.current++; + setTimeout(connect, 2000 * retryCountRef.current); + } + }; + } + + connect(); + + return () => { + eventSourceRef.current?.close(); + eventSourceRef.current = null; + setConnected(false); + }; + }, [taskId, addLog]); + + return { logs, connected }; +} diff --git a/frontend-v2/src/modules/asl/index.tsx b/frontend-v2/src/modules/asl/index.tsx index 45b84eaf..b35a7673 100644 --- a/frontend-v2/src/modules/asl/index.tsx +++ b/frontend-v2/src/modules/asl/index.tsx @@ -25,6 +25,11 @@ const ResearchSearch = lazy(() => import('./pages/ResearchSearch')); // Deep Research V2.0 const DeepResearchPage = lazy(() => import('./pages/DeepResearchPage')); +// 工具 3:全文智能提取(M2 路由拆分) +const ExtractionSetup = lazy(() => import('./pages/ExtractionSetup')); +const ExtractionProgress = lazy(() => import('./pages/ExtractionProgress')); +const ExtractionWorkbench = lazy(() => import('./pages/ExtractionWorkbench')); + const ASLModule = () => { return ( { } /> } /> + + {/* 工具 3:全文智能提取(M2 三步路由) */} + + } /> + } /> + } /> + } /> + diff --git a/frontend-v2/src/modules/asl/pages/ExtractionPage.tsx 
b/frontend-v2/src/modules/asl/pages/ExtractionPage.tsx new file mode 100644 index 00000000..ed37b453 --- /dev/null +++ b/frontend-v2/src/modules/asl/pages/ExtractionPage.tsx @@ -0,0 +1,331 @@ +/** + * 工具 3:全文智能提取 — 状态驱动路由页面 + * + * Step 1: 选模板 + 选 PKB 文献 → 创建任务 + * Step 2: 轮询进度 + * Step 3: 提取结果列表 + */ + +import { useState, useEffect } from 'react'; +import { Card, Steps, Button, Select, Checkbox, Table, Progress, Tag, Empty, Spin, message, Space, Alert, Typography } from 'antd'; +import { FileTextOutlined, ThunderboltOutlined, CheckCircleOutlined, DatabaseOutlined, ReloadOutlined } from '@ant-design/icons'; +import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; +import { aslApi } from '../api'; + +const { Title, Text } = Typography; + +type PageStep = 'setup' | 'progress' | 'results'; + +const ExtractionPage = () => { + const queryClient = useQueryClient(); + const [step, setStep] = useState('setup'); + const [taskId, setTaskId] = useState(null); + + // ── Step 1 State ───────────────────────── + const [selectedTemplateId, setSelectedTemplateId] = useState(''); + const [selectedKbId, setSelectedKbId] = useState(''); + const [selectedDocIds, setSelectedDocIds] = useState([]); + const [projectTemplateId, setProjectTemplateId] = useState(''); + + // 临时 projectId(M1 简化:用 Date.now 生成) + const [projectId] = useState(() => `ext-${Date.now()}`); + + // ── 数据查询 ───────────────────────────── + const { data: templatesResp, isLoading: loadingTemplates } = useQuery({ + queryKey: ['extraction-templates'], + queryFn: () => aslApi.getExtractionTemplates(), + }); + const templates = templatesResp?.data || []; + + const { data: kbsResp, isLoading: loadingKbs } = useQuery({ + queryKey: ['extraction-knowledge-bases'], + queryFn: () => aslApi.getExtractionKnowledgeBases(), + }); + const knowledgeBases = kbsResp?.data || []; + + const { data: docsResp, isLoading: loadingDocs } = useQuery({ + queryKey: ['extraction-documents', selectedKbId], + queryFn: () => 
aslApi.getExtractionDocuments(selectedKbId), + enabled: !!selectedKbId, + }); + const documents = docsResp?.data || []; + + // ── 克隆模板 ───────────────────────────── + const cloneMutation = useMutation({ + mutationFn: () => aslApi.cloneExtractionTemplate(projectId, selectedTemplateId), + onSuccess: (resp) => { + setProjectTemplateId(resp.data?.id || ''); + }, + }); + + useEffect(() => { + if (selectedTemplateId && projectId) { + cloneMutation.mutate(); + } + }, [selectedTemplateId]); + + // ── 创建任务 ───────────────────────────── + const createTaskMutation = useMutation({ + mutationFn: () => + aslApi.createExtractionTask({ + projectId, + projectTemplateId, + pkbKnowledgeBaseId: selectedKbId, + documentIds: selectedDocIds, + idempotencyKey: `${projectId}-${Date.now()}`, + }), + onSuccess: (resp: any) => { + const id = resp.taskId || resp.data?.taskId; + if (id) { + setTaskId(id); + setStep('progress'); + message.success(`任务已创建,正在提取 ${selectedDocIds.length} 篇文献`); + } + }, + onError: (err: any) => { + message.error(err.message || '创建任务失败'); + }, + }); + + // ── Step 2: 轮询进度 ──────────────────── + const { data: statusResp } = useQuery({ + queryKey: ['extraction-task-status', taskId], + queryFn: () => aslApi.getExtractionTaskStatus(taskId!), + enabled: step === 'progress' && !!taskId, + refetchInterval: 3000, + }); + const taskStatus = statusResp?.data; + + useEffect(() => { + if (taskStatus && (taskStatus.status === 'completed' || taskStatus.status === 'failed')) { + setStep('results'); + queryClient.invalidateQueries({ queryKey: ['extraction-task-results', taskId] }); + } + }, [taskStatus?.status]); + + // ── Step 3: 提取结果 ──────────────────── + const { data: resultsResp, isLoading: loadingResults } = useQuery({ + queryKey: ['extraction-task-results', taskId], + queryFn: () => aslApi.getExtractionTaskResults(taskId!), + enabled: step === 'results' && !!taskId, + }); + const results = resultsResp?.data || []; + + // ── 渲染 ───────────────────────────────── + const 
currentStep = step === 'setup' ? 0 : step === 'progress' ? 1 : 2; + + return ( +
+ + <FileTextOutlined className="mr-2" /> + 全文智能提取工作台 + + + }, + { title: '提取进行中', icon: }, + { title: '提取结果', icon: }, + ]} + /> + + {/* ═══ Step 1: 配置 ═══ */} + {step === 'setup' && ( + + + {/* 模板选择 */} +
+ 选择提取模板 + { setSelectedKbId(v); setSelectedDocIds([]); }} + options={knowledgeBases.map((kb: any) => ({ + value: kb.id, + label: `${kb.name} (${kb.fileCount} 篇)`, + }))} + /> +
+ + {/* 文献列表 */} + {selectedKbId && ( +
+
+ 选择文献 ({selectedDocIds.length}/{documents.length} 已选) + 0} + indeterminate={selectedDocIds.length > 0 && selectedDocIds.length < documents.length} + onChange={(e) => { + setSelectedDocIds( + e.target.checked ? documents.map((d: any) => d.documentId) : [] + ); + }} + > + 全选 + +
+ {loadingDocs ? ( + + ) : documents.length === 0 ? ( + + ) : ( + setSelectedDocIds(vals as string[])} + style={{ width: '100%' }} + > +
+ {documents.map((doc: any) => ( +
+ + {doc.filename} + + ({(doc.fileSizeBytes / 1024 / 1024).toFixed(1)} MB) + + +
+ ))} +
+
+ )} +
+ )} + + {/* 提交按钮 */} + +
+
+ )} + + {/* ═══ Step 2: 进度 ═══ */} + {step === 'progress' && taskStatus && ( + +
+ + 正在提取... + +
+
总计: {taskStatus.totalCount} 篇
+
+ 已完成: {taskStatus.completedCount} + 提取中: {taskStatus.extractingCount} + 等待中: {taskStatus.pendingCount} + {taskStatus.errorCount > 0 && ( + <>失败: {taskStatus.errorCount} + )} +
+
+
+
+ )} + + {/* ═══ Step 3: 结果 ═══ */} + {step === 'results' && ( + + {taskStatus && taskStatus.status === 'failed' && ( + + )} + +
+ 提取结果 + +
+ +
{ + const map: Record = { + completed: { color: 'green', text: '已完成' }, + error: { color: 'red', text: '失败' }, + extracting: { color: 'blue', text: '提取中' }, + pending: { color: 'default', text: '等待中' }, + }; + const { color, text } = map[status] || { color: 'default', text: status }; + return {text}; + }, + }, + { + title: 'Study ID', + dataIndex: ['extractedData', 'metadata', 'study_id'], + render: (v: any) => v || '-', + }, + { + title: '错误信息', + dataIndex: 'errorMessage', + ellipsis: true, + render: (v: any) => v ? {v} : '-', + }, + ]} + /> + +
+ +
+ + )} + + ); +}; + +export default ExtractionPage; diff --git a/frontend-v2/src/modules/asl/pages/ExtractionProgress.tsx b/frontend-v2/src/modules/asl/pages/ExtractionProgress.tsx new file mode 100644 index 00000000..eb660f2e --- /dev/null +++ b/frontend-v2/src/modules/asl/pages/ExtractionProgress.tsx @@ -0,0 +1,140 @@ +/** + * 工具 3 Step 2: 提取进度 + 终端日志 + * 原型图 View 2: 居中布局,进度条 + ProcessingTerminal + * 双轨制:React Query 轮询驱动进度条/跳转,SSE 驱动日志区 + */ + +import { useEffect } from 'react'; +import { useParams, useNavigate } from 'react-router-dom'; +import { Card, Progress, Tag, Button, Typography, Spin, Space, Alert } from 'antd'; +import { CheckCircleOutlined, CloseCircleOutlined, RocketOutlined } from '@ant-design/icons'; +import { useQuery, useQueryClient } from '@tanstack/react-query'; +import { aslApi } from '../api'; +import ProcessingTerminal from '../components/extraction/ProcessingTerminal'; + +const { Title, Text } = Typography; + +const ExtractionProgress = () => { + const { taskId } = useParams<{ taskId: string }>(); + const navigate = useNavigate(); + const queryClient = useQueryClient(); + + const { data: statusResp, isLoading } = useQuery({ + queryKey: ['extraction-task-status', taskId], + queryFn: () => aslApi.getExtractionTaskStatus(taskId!), + enabled: !!taskId, + refetchInterval: (query) => { + const st = query.state.data?.data?.status; + if (st === 'completed' || st === 'failed') return false; + return 3000; + }, + }); + const taskStatus = statusResp?.data; + + // Aggregator cron 可能延迟 1-2 分钟才更新 Task 状态, + // 所以同时检查:Task 状态已收口 OR 所有 Result 均已结束(pending=0 且 extracting=0) + const isDone = + taskStatus?.status === 'completed' || + taskStatus?.status === 'failed' || + (taskStatus && taskStatus.totalCount > 0 && taskStatus.pendingCount === 0 && taskStatus.extractingCount === 0); + + useEffect(() => { + if (isDone && taskId) { + queryClient.invalidateQueries({ queryKey: ['extraction-task-results', taskId] }); + } + }, [isDone, taskId, queryClient]); + + if 
(!taskId) { + return
缺少 taskId 参数
; + } + + if (isLoading || !taskStatus) { + return ( +
+ +
+ ); + } + + const progressStatus = taskStatus.status === 'failed' ? 'exception' : isDone ? 'success' : 'active'; + + return ( +
+ +
+ {isDone ? ( + taskStatus.status === 'completed' ? ( + + ) : ( + + ) + ) : ( + + )} + + + {isDone + ? taskStatus.status === 'completed' + ? '提取完成!' + : '提取结束(部分失败)' + : '正在智能提取...'} + + + + + +
+ 总计 +
{taskStatus.totalCount}
+
+
+ {taskStatus.completedCount} 完成 +
+
+ {taskStatus.extractingCount} 提取中 +
+
+ {taskStatus.pendingCount} 等待中 +
+ {taskStatus.errorCount > 0 && ( +
+ {taskStatus.errorCount} 失败 +
+ )} +
+
+
+ + + + {isDone && ( +
+ {taskStatus.errorCount > 0 && ( + + )} +
+ +
+
+ )} +
+ ); +}; + +export default ExtractionProgress; diff --git a/frontend-v2/src/modules/asl/pages/ExtractionSetup.tsx b/frontend-v2/src/modules/asl/pages/ExtractionSetup.tsx new file mode 100644 index 00000000..b744d761 --- /dev/null +++ b/frontend-v2/src/modules/asl/pages/ExtractionSetup.tsx @@ -0,0 +1,259 @@ +/** + * 工具 3 Step 1: 配置提取模板 + 选择 PKB 文献 + * 原型图 View 1: 5:2 双栏 — 左3模板 + 右2文献 + */ + +import { useState, useEffect } from 'react'; +import { useNavigate } from 'react-router-dom'; +import { Row, Col, Card, Button, Select, Checkbox, Tag, Empty, Spin, message, Typography } from 'antd'; +import { ThunderboltOutlined, LockOutlined, DatabaseOutlined, FilePdfOutlined } from '@ant-design/icons'; +import { useQuery, useMutation } from '@tanstack/react-query'; +import { aslApi } from '../api'; + +const { Text } = Typography; + +const ExtractionSetup = () => { + const navigate = useNavigate(); + + const [selectedTemplateId, setSelectedTemplateId] = useState(''); + const [selectedKbId, setSelectedKbId] = useState(''); + const [selectedDocIds, setSelectedDocIds] = useState([]); + const [projectTemplateId, setProjectTemplateId] = useState(''); + const [projectId] = useState(() => `ext-${Date.now()}`); + + const { data: templatesResp, isLoading: loadingTemplates } = useQuery({ + queryKey: ['extraction-templates'], + queryFn: () => aslApi.getExtractionTemplates(), + }); + const templates = templatesResp?.data || []; + + const selectedTemplate = templates.find((t: any) => t.id === selectedTemplateId); + + const { data: kbsResp, isLoading: loadingKbs } = useQuery({ + queryKey: ['extraction-knowledge-bases'], + queryFn: () => aslApi.getExtractionKnowledgeBases(), + }); + const knowledgeBases = kbsResp?.data || []; + + const { data: docsResp, isLoading: loadingDocs } = useQuery({ + queryKey: ['extraction-documents', selectedKbId], + queryFn: () => aslApi.getExtractionDocuments(selectedKbId), + enabled: !!selectedKbId, + }); + const documents = docsResp?.data || []; + + const 
cloneMutation = useMutation({ + mutationFn: () => aslApi.cloneExtractionTemplate(projectId, selectedTemplateId), + onSuccess: (resp) => { + setProjectTemplateId(resp.data?.id || ''); + }, + }); + + useEffect(() => { + if (selectedTemplateId && projectId) { + cloneMutation.mutate(); + } + }, [selectedTemplateId]); + + const createTaskMutation = useMutation({ + mutationFn: () => + aslApi.createExtractionTask({ + projectId, + projectTemplateId, + pkbKnowledgeBaseId: selectedKbId, + documentIds: selectedDocIds, + idempotencyKey: `${projectId}-${Date.now()}`, + }), + onSuccess: (resp: any) => { + const id = resp.taskId || resp.data?.taskId; + if (id) { + message.success(`任务已创建,正在提取 ${selectedDocIds.length} 篇文献`); + navigate(`/literature/extraction/progress/${id}`); + } + }, + onError: (err: any) => { + message.error(err.message || '创建任务失败'); + }, + }); + + const baseFields = selectedTemplate?.baseFields as Record | undefined; + const fieldModuleNames: Record = { + metadata: '基础元数据', + baseline: '基线特征', + rob: '偏倚风险评估', + outcomes_survival: '结局-生存', + outcomes_dichotomous: '结局-二分类', + outcomes_continuous: '结局-连续型', + }; + + return ( +
+ + {/* Left 3/5: Template Configuration */} +
+ + + 步骤 1:配置提取模板 (Schema) + + } + > +
+ 选择系统通用基座 + { setSelectedKbId(v); setSelectedDocIds([]); }} + options={knowledgeBases.map((kb: any) => ({ + value: kb.id, + label: `${kb.name} (${kb.fileCount} 篇)`, + }))} + /> +
+ + {selectedKbId && ( +
+
+ + 文献列表 ({selectedDocIds.length}/{documents.length} 已选) + + 0} + indeterminate={selectedDocIds.length > 0 && selectedDocIds.length < documents.length} + onChange={(e) => { + setSelectedDocIds( + e.target.checked ? documents.map((d: any) => d.documentId) : [] + ); + }} + > + 全选 + +
+ {loadingDocs ? ( +
+ ) : documents.length === 0 ? ( + + ) : ( + setSelectedDocIds(vals as string[])} + style={{ width: '100%' }} + > +
+ {documents.map((doc: any) => ( +
+ + + {doc.filename} + + {(doc.fileSizeBytes / 1024 / 1024).toFixed(1)} MB + + +
+ ))} +
+
+ )} +
+ )} + + {!selectedKbId && ( +
+ 请先选择一个 PKB 知识库 +
+ )} +
+ + + +
+ +
+ + ); +}; + +export default ExtractionSetup; diff --git a/frontend-v2/src/modules/asl/pages/ExtractionWorkbench.tsx b/frontend-v2/src/modules/asl/pages/ExtractionWorkbench.tsx new file mode 100644 index 00000000..8d316e63 --- /dev/null +++ b/frontend-v2/src/modules/asl/pages/ExtractionWorkbench.tsx @@ -0,0 +1,281 @@ +/** + * 工具 3 Step 3: 全屏审核工作台 + * 原型图 View 3: 全宽表格 + 700px 右侧审核抽屉 + */ + +import { useState } from 'react'; +import { useParams, useNavigate } from 'react-router-dom'; +import { Table, Tag, Button, Alert, Space, Typography, message } from 'antd'; +import { + CheckCircleOutlined, + EyeOutlined, + DownloadOutlined, + ReloadOutlined, + ArrowLeftOutlined, + ExclamationCircleOutlined, +} from '@ant-design/icons'; +import { useQuery, useQueryClient } from '@tanstack/react-query'; +import { aslApi } from '../api'; +import ExtractionDrawer from '../components/extraction/ExtractionDrawer'; +import ExtractionStatusBadge from '../components/extraction/ExtractionStatusBadge'; + +const { Title, Text } = Typography; + +/** + * 从 extractedData 中提取某个模块某个字段的实际值 + * 兼容数组格式 [{key, value, quote}] 和扁平格式 {field: value} + */ +function getExtractedField(data: any, moduleName: string, fieldKey: string): string { + if (!data) return ''; + // 找到模块数据(兼容 outcomes_survival 等) + let modData = data[moduleName]; + if (!modData) { + for (const k of Object.keys(data)) { + if (k.startsWith(moduleName) || (moduleName === 'outcomes' && k.startsWith('outcomes_'))) { + modData = data[k]; + break; + } + } + } + if (!modData) return ''; + + // 数组格式 + if (Array.isArray(modData)) { + const item = modData.find((f: any) => f?.key === fieldKey); + if (!item) return ''; + const v = item.value; + if (v === null || v === undefined) return ''; + if (typeof v === 'object') return JSON.stringify(v); + return String(v); + } + + // 扁平对象格式 + const val = modData[fieldKey]; + if (val === null || val === undefined) return ''; + if (typeof val === 'object' && 'value' in val) return String(val.value ?? 
''); + if (typeof val === 'object') return JSON.stringify(val); + return String(val); +} + +const ExtractionWorkbench = () => { + const { taskId } = useParams<{ taskId: string }>(); + const navigate = useNavigate(); + const queryClient = useQueryClient(); + + const [selectedResultId, setSelectedResultId] = useState(null); + const [drawerVisible, setDrawerVisible] = useState(false); + + const { data: statusResp } = useQuery({ + queryKey: ['extraction-task-status', taskId], + queryFn: () => aslApi.getExtractionTaskStatus(taskId!), + enabled: !!taskId, + }); + const taskStatus = statusResp?.data; + + const { data: resultsResp, isLoading: loadingResults } = useQuery({ + queryKey: ['extraction-task-results', taskId], + queryFn: () => aslApi.getExtractionTaskResults(taskId!), + enabled: !!taskId, + }); + const results = (resultsResp?.data || []) as any[]; + + const approvedCount = results.filter((r: any) => r.reviewStatus === 'approved').length; + const pendingCount = results.filter((r: any) => r.status === 'completed' && r.reviewStatus !== 'approved').length; + const errorCount = results.filter((r: any) => r.status === 'error').length; + + const handleOpenDrawer = (resultId: string) => { + setSelectedResultId(resultId); + setDrawerVisible(true); + }; + + const handleDrawerClose = () => { + setDrawerVisible(false); + setSelectedResultId(null); + }; + + const handleReviewSaved = () => { + queryClient.invalidateQueries({ queryKey: ['extraction-task-results', taskId] }); + }; + + const handleExportExcel = async () => { + if (!taskId) return; + try { + const blob = await aslApi.exportExtractionResults(taskId); + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = `extraction-results-${taskId}.xlsx`; + a.click(); + URL.revokeObjectURL(url); + message.success('导出成功'); + } catch (err: any) { + message.error(err.message || '导出失败'); + } + }; + + if (!taskId) { + return
缺少 taskId 参数
; + } + + const columns = [ + { + title: '#', + width: 50, + render: (_: any, __: any, index: number) => index + 1, + }, + { + title: '文件名 / Study ID', + key: 'filename', + ellipsis: true, + render: (_: any, record: any) => { + const studyId = getExtractedField(record.extractedData, 'metadata', 'study_id'); + return ( +
+
+ {studyId || record.snapshotFilename} +
+ {studyId && ( +
{record.snapshotFilename}
+ )} +
+ ); + }, + }, + { + title: '解析流程', + key: 'parseFlow', + width: 140, + render: () => ( + + MinerU + DeepSeek + + ), + }, + { + title: '提取状态', + dataIndex: 'status', + width: 100, + render: (status: string) => { + const map: Record = { + completed: { color: 'green', text: '已完成' }, + error: { color: 'red', text: '失败' }, + extracting: { color: 'blue', text: '提取中' }, + pending: { color: 'default', text: '等待中' }, + }; + const { color, text } = map[status] || { color: 'default', text: status }; + return {text}; + }, + }, + { + title: '审核状态', + key: 'reviewStatus', + width: 110, + render: (_: any, record: any) => ( + + ), + }, + { + title: '操作', + key: 'action', + width: 100, + render: (_: any, record: any) => { + if (record.status !== 'completed') return -; + const isApproved = record.reviewStatus === 'approved'; + return ( + + ); + }, + }, + { + title: '错误信息', + dataIndex: 'errorMessage', + width: 180, + ellipsis: true, + render: (v: any) => + v ? {v} : '-', + }, + ]; + + return ( +
+ {/* Info Banner */} +
+ + + + 提取结果审核工作台 + + + + + + +
+ + {/* Summary banner */} + {taskStatus && ( + 0 ? 'warning' : 'success'} + icon={errorCount > 0 ? : } + message={ + + 机器提取完毕!共 {results.length} 篇, + {approvedCount} 已审核 + {pendingCount} 待审核 + {errorCount > 0 && {errorCount} 失败} + + } + showIcon + className="mb-4" + /> + )} + +
50 ? { pageSize: 50 } : false} + size="middle" + scroll={{ x: 900 }} + /> + + + + ); +}; + +export default ExtractionWorkbench; diff --git a/r-statistics-service/tools/.Rhistory b/r-statistics-service/tools/.Rhistory new file mode 100644 index 00000000..6520b0eb --- /dev/null +++ b/r-statistics-service/tools/.Rhistory @@ -0,0 +1,76 @@ +# ======================================== +# 步骤 1: 描述性统计 +# ======================================== +# SSA-Pro 自动生成代码 +# 工具: 描述性统计 +# 时间: 2026-02-25 07:58:34.356454 +# ================================ +library(ggplot2) +# 数据准备 +df <- read.csv("E:/test.csv") +# 数值变量描述性统计 +numeric_vars <- sapply(df, is.numeric) +if (any(numeric_vars)) { +print(summary(df[, numeric_vars, drop = FALSE])) +} +# 分类变量频数表 +categorical_vars <- !numeric_vars +if (any(categorical_vars)) { +for (v in names(df)[categorical_vars]) { +cat("\n变量:", v, "\n") +print(table(df[[v]], useNA = "ifany")) +} +} +# ======== 可视化 ======== +# Bar chart: Yqol +p_Yqol <- ggplot(df[!is.na(df[["Yqol"]]), ], aes(x = factor(.data[["Yqol"]]))) + +geom_bar(fill = "#3b82f6", alpha = 0.7) + +labs(title = "Frequency of Yqol", x = "Yqol", y = "Count") + +theme_minimal() + +theme(axis.text.x = element_text(angle = 45, hjust = 1)) +print(p_Yqol) +# Bar chart: sex +p_sex <- ggplot(df[!is.na(df[["sex"]]), ], aes(x = factor(.data[["sex"]]))) + +geom_bar(fill = "#3b82f6", alpha = 0.7) + +labs(title = "Frequency of sex", x = "sex", y = "Count") + +theme_minimal() + +theme(axis.text.x = element_text(angle = 45, hjust = 1)) +print(p_sex) +# Bar chart: smoke +p_smoke <- ggplot(df[!is.na(df[["smoke"]]), ], aes(x = factor(.data[["smoke"]]))) + +geom_bar(fill = "#3b82f6", alpha = 0.7) + +labs(title = "Frequency of smoke", x = "smoke", y = "Count") + +theme_minimal() + +theme(axis.text.x = element_text(angle = 45, hjust = 1)) +print(p_smoke) +# Histogram: age +p_age <- ggplot(df[!is.na(df[["age"]]), ], aes(x = .data[["age"]])) + +geom_histogram(fill = "#3b82f6", alpha = 0.7, bins = 30) + +labs(title = 
"Distribution of age", x = "age", y = "Count") + +theme_minimal() +print(p_age) +# ======================================== +# 步骤 2: 二元Logistic回归 +# ======================================== +# SSA-Pro 自动生成代码 +# 工具: 二元 Logistic 回归 +# 时间: 2026-02-25 07:58:34.813076 +# ================================ +# 数据准备 +df <- read.csv("E:/test.csv") +# 模型拟合 +model <- glm(Yqol ~ sex + smoke + age + bmi + mouth_open + bucal_relax + toot_morph + root_number + root_curve + lenspace + denseratio + Pglevel + Pgverti + Winter + presyp + flap + operation + time + surgage + times, data = df, family = binomial(link = "logit")) +summary(model) +# OR 和 95% CI +coef_summary <- summary(model)$coefficients +OR <- exp(coef_summary[, "Estimate"]) +CI_lower <- exp(coef_summary[, "Estimate"] - 1.96 * coef_summary[, "Std. Error"]) +CI_upper <- exp(coef_summary[, "Estimate"] + 1.96 * coef_summary[, "Std. Error"]) +results <- data.frame(OR = OR, CI_lower = CI_lower, CI_upper = CI_upper, +p_value = coef_summary[, "Pr(>|z|)"]) +print(round(results, 3)) +# 模型拟合度 +cat("AIC:", AIC(model), "\n") +# VIF(需要 car 包) +# library(car) +# vif(model)