feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline: - Scatter-dispatch + Aggregator polling pattern (PgBoss) - PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs) - ExtractionSingleWorker with DeepSeek-V3 LLM extraction - PermanentExtractionError for non-retryable failures - Phantom Retry Guard (idempotent worker) - 3-step minimal frontend (Setup -> Progress -> Workbench) - 4 new DB tables (extraction_templates, project_templates, tasks, results) - 3 system templates seed (RCT, Cohort, QC) - M1 integration test suite M2 HITL Workbench: - MinerU VLM integration for high-fidelity table extraction - XML-isolated DynamicPromptBuilder with flat JSON output template - fuzzyQuoteMatch validator (3-tier confidence scoring) - SSE real-time logging via ExtractionEventBus - Schema-driven ExtractionDrawer (dynamic field rendering from template) - Excel wide-table export with flattenModuleData normalization - M2 integration test suite Critical Fixes (data normalization): - DynamicPromptBuilder: explicit flat key-value output format with example - ExtractionExcelExporter: handle both array and flat data formats - ExtractionDrawer: schema-driven rendering instead of hardcoded fields - ExtractionValidator: array-format quote verification support - SSE route: Fastify register encapsulation to bypass auth for EventSource - LLM JSON sanitizer: strip illegal control chars before JSON.parse Also includes: RVW stats verification spec, SSA expert config guide Tested: M1 pipeline test + M2 HITL test + manual frontend verification Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions
--- a/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql
+++ b/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql
@@ -0,0 +1,91 @@
+-- ASL Tool 3: Full-text Smart Extraction Workbench V2.0
+-- Architecture: Scatter-dispatch + Independent Worker + Aggregator polling reconciliation
+-- 4 new tables in asl_schema
+
+-- CreateTable: System extraction templates (RCT / Cohort / QC), one row per template "code"
+CREATE TABLE "asl_schema"."extraction_templates" (
+    "id" TEXT NOT NULL, -- no DB default; the Prisma model supplies it via @default(uuid())
+    "code" TEXT NOT NULL, -- unique, see extraction_templates_code_key
+    "name" TEXT NOT NULL,
+    "description" TEXT,
+    "baseFields" JSONB NOT NULL, -- NOTE(review): only camelCase column in this migration; it mirrors the Prisma field baseFields, which has no @map, so a rename must change both
+    "is_system" BOOLEAN NOT NULL DEFAULT true,
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL, -- no DB default; the Prisma model marks it @updatedAt, raw SQL writers must set it themselves
+
+    CONSTRAINT "extraction_templates_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable: Project-level templates (cloned from system + custom fields); unique per (project_id, base_template_id)
+CREATE TABLE "asl_schema"."extraction_project_templates" (
+    "id" TEXT NOT NULL,
+    "project_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "user_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "base_template_id" TEXT NOT NULL, -- FK to extraction_templates.id (ON DELETE RESTRICT)
+    "outcome_type" TEXT NOT NULL DEFAULT 'survival', -- Prisma model comment lists: survival | dichotomous | continuous (not enforced by a CHECK)
+    "custom_fields" JSONB NOT NULL DEFAULT '[]', -- defaults to an empty JSON array
+    "is_locked" BOOLEAN NOT NULL DEFAULT false,
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL, -- no DB default; the Prisma model marks it @updatedAt
+
+    CONSTRAINT "extraction_project_templates_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable: Extraction tasks (1 task = batch extract N documents); child result rows are removed with the task
+CREATE TABLE "asl_schema"."extraction_tasks" (
+    "id" TEXT NOT NULL,
+    "project_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "user_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "project_template_id" TEXT NOT NULL, -- FK to extraction_project_templates.id (ON DELETE RESTRICT)
+    "pkb_knowledge_base_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "idempotency_key" TEXT, -- nullable, but unique when present (extraction_tasks_idempotency_key_key)
+    "total_count" INTEGER NOT NULL, -- presumably the N documents in the batch; confirm against the dispatcher
+    "status" TEXT NOT NULL DEFAULT 'processing', -- Prisma model comment lists: processing | completed | failed (not enforced by a CHECK)
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL, -- no DB default; the Prisma model marks it @updatedAt
+    "completed_at" TIMESTAMP(3),
+
+    CONSTRAINT "extraction_tasks_pkey" PRIMARY KEY ("id")
+);
+
+-- CreateTable: Per-document extraction results (Worker only writes its own row); rows cascade-delete with their task
+CREATE TABLE "asl_schema"."extraction_results" (
+    "id" TEXT NOT NULL,
+    "task_id" TEXT NOT NULL, -- FK to extraction_tasks.id (ON DELETE CASCADE)
+    "project_id" TEXT NOT NULL, -- also stored on the parent task; no foreign key is declared for it here
+    "pkb_document_id" TEXT NOT NULL, -- no foreign key is declared for this column in this migration
+    "snapshot_storage_key" TEXT NOT NULL,
+    "snapshot_filename" TEXT NOT NULL,
+    "status" TEXT NOT NULL DEFAULT 'pending', -- Prisma model comment lists: pending | extracting | completed | error (not enforced by a CHECK)
+    "extracted_data" JSONB,
+    "quote_verification" JSONB,
+    "manual_overrides" JSONB,
+    "review_status" TEXT NOT NULL DEFAULT 'pending', -- Prisma model comment lists: pending | approved (not enforced by a CHECK)
+    "reviewed_at" TIMESTAMP(3),
+    "error_message" TEXT,
+    "processed_at" TIMESTAMP(3),
+    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    "updated_at" TIMESTAMP(3) NOT NULL, -- no DB default; the Prisma model marks it @updatedAt
+
+    CONSTRAINT "extraction_results_pkey" PRIMARY KEY ("id")
+);
+
+-- Unique indexes (names match the @unique / @@unique declarations in schema.prisma)
+CREATE UNIQUE INDEX "extraction_templates_code_key" ON "asl_schema"."extraction_templates"("code");
+CREATE UNIQUE INDEX "extraction_tasks_idempotency_key_key" ON "asl_schema"."extraction_tasks"("idempotency_key"); -- column is nullable; assuming PostgreSQL, rows with NULL keys do not conflict with each other
+CREATE UNIQUE INDEX "unique_extraction_project_base_template" ON "asl_schema"."extraction_project_templates"("project_id", "base_template_id");
+
+-- Performance indexes (names match the @@index map values in schema.prisma; keep both files in sync when changing them)
+CREATE INDEX "idx_extraction_project_templates_project_id" ON "asl_schema"."extraction_project_templates"("project_id"); -- NOTE(review): "project_id" is already the leading column of unique_extraction_project_base_template; likely redundant
+CREATE INDEX "idx_extraction_project_templates_user_id" ON "asl_schema"."extraction_project_templates"("user_id");
+CREATE INDEX "idx_extraction_tasks_project_id" ON "asl_schema"."extraction_tasks"("project_id");
+CREATE INDEX "idx_extraction_tasks_user_id" ON "asl_schema"."extraction_tasks"("user_id");
+CREATE INDEX "idx_extraction_tasks_status" ON "asl_schema"."extraction_tasks"("status");
+CREATE INDEX "idx_extraction_results_task_status" ON "asl_schema"."extraction_results"("task_id", "status");
+CREATE INDEX "idx_extraction_results_task_id" ON "asl_schema"."extraction_results"("task_id"); -- NOTE(review): "task_id" is already the leading column of idx_extraction_results_task_status; likely redundant
+CREATE INDEX "idx_extraction_results_project_id" ON "asl_schema"."extraction_results"("project_id");
+
+-- Foreign keys (only the template -> project template -> task -> result chain; project_id, user_id and pkb_* columns get no FK here)
+ALTER TABLE "asl_schema"."extraction_project_templates" ADD CONSTRAINT "extraction_project_templates_base_template_id_fkey" FOREIGN KEY ("base_template_id") REFERENCES "asl_schema"."extraction_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE; -- a system template cannot be deleted while project templates reference it
+ALTER TABLE "asl_schema"."extraction_tasks" ADD CONSTRAINT "extraction_tasks_project_template_id_fkey" FOREIGN KEY ("project_template_id") REFERENCES "asl_schema"."extraction_project_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE; -- a project template cannot be deleted while tasks reference it
+ALTER TABLE "asl_schema"."extraction_results" ADD CONSTRAINT "extraction_results_task_id_fkey" FOREIGN KEY ("task_id") REFERENCES "asl_schema"."extraction_tasks"("id") ON DELETE CASCADE ON UPDATE CASCADE; -- deleting a task deletes its result rows
--- a/backend/prisma/schema.prisma
+++ b/backend/prisma/schema.prisma
@@ -590,6 +590,102 @@ model AslFulltextScreeningResult {
  @@schema("asl_schema")
 }

+// ═══════════════════════════════════════════════════════════════
+// ASL 工具 3：全文智能提取工作台 V2.0
+// 架构：散装派发 + 独立 Worker + Aggregator 轮询收口
+// ═══════════════════════════════════════════════════════════════
+
+/// Built-in system extraction templates (RCT / Cohort / QC); maintained by administrators, read-only for users
+model AslExtractionTemplate {
+  id          String   @id @default(uuid())
+  code        String   @unique                // RCT / Cohort / QC
+  name        String                          // Display name: randomized controlled trial / cohort study / quality improvement
+  description String?
+  baseFields  Json                            // { metadata: [...], baseline: [...], rob: [...], outcomes_survival: [...], ... }; NOTE(review): no @map, so the DB column is "baseFields" while sibling columns are snake_case
+  isSystem    Boolean  @default(true) @map("is_system")
+  createdAt   DateTime @default(now()) @map("created_at")
+  updatedAt   DateTime @updatedAt @map("updated_at")
+
+  projectTemplates AslProjectTemplate[] @relation("BaseTemplateProjectTemplates")
+
+  @@map("extraction_templates")
+  @@schema("asl_schema")
+}
+
+/// Project-level template (cloned from a system template plus user-defined custom field slots; custom fields are enabled in M3)
+model AslProjectTemplate {
+  id             String   @id @default(uuid())
+  projectId      String   @map("project_id")
+  userId         String   @map("user_id")
+  baseTemplateId String   @map("base_template_id")
+  outcomeType    String   @default("survival") @map("outcome_type") // survival | dichotomous | continuous
+  customFields   Json     @default("[]") @map("custom_fields")      // M3: [{name, type, prompt}]
+  isLocked       Boolean  @default(false) @map("is_locked")
+  createdAt      DateTime @default(now()) @map("created_at")
+  updatedAt      DateTime @updatedAt @map("updated_at")
+
+  baseTemplate AslExtractionTemplate @relation("BaseTemplateProjectTemplates", fields: [baseTemplateId], references: [id])
+  tasks        AslExtractionTask[]   @relation("TemplateExtractionTasks")
+
+  @@unique([projectId, baseTemplateId], map: "unique_extraction_project_base_template") // at most one project template per (project, base template)
+  @@index([projectId], map: "idx_extraction_project_templates_project_id")
+  @@index([userId], map: "idx_extraction_project_templates_user_id")
+  @@map("extraction_project_templates")
+  @@schema("asl_schema")
+}
+
+/// Extraction task (1 task = batch extraction of N documents); its status is modified only by the Aggregator
+model AslExtractionTask {
+  id                 String    @id @default(uuid())
+  projectId          String    @map("project_id")
+  userId             String    @map("user_id")
+  projectTemplateId  String    @map("project_template_id")
+  pkbKnowledgeBaseId String    @map("pkb_knowledge_base_id")
+  idempotencyKey     String?   @unique @map("idempotency_key") // optional, but unique when set
+  totalCount         Int       @map("total_count")
+  status             String    @default("processing") // processing | completed | failed
+  createdAt          DateTime  @default(now()) @map("created_at")
+  updatedAt          DateTime  @updatedAt @map("updated_at")
+  completedAt        DateTime? @map("completed_at")
+
+  projectTemplate AslProjectTemplate    @relation("TemplateExtractionTasks", fields: [projectTemplateId], references: [id])
+  results         AslExtractionResult[] @relation("TaskExtractionResults")
+
+  @@index([projectId], map: "idx_extraction_tasks_project_id")
+  @@index([userId], map: "idx_extraction_tasks_user_id")
+  @@index([status], map: "idx_extraction_tasks_status")
+  @@map("extraction_tasks")
+  @@schema("asl_schema")
+}
+
+/// Extraction result for a single document; a Worker writes only its own Result row and never touches the Task table
+model AslExtractionResult {
+  id                 String    @id @default(uuid())
+  taskId             String    @map("task_id")
+  projectId          String    @map("project_id")
+  pkbDocumentId      String    @map("pkb_document_id")
+  snapshotStorageKey String    @map("snapshot_storage_key") // PKB OSS path frozen at the API layer
+  snapshotFilename   String    @map("snapshot_filename")    // Original filename frozen at the API layer
+  status             String    @default("pending")          // pending | extracting | completed | error
+  extractedData      Json?     @map("extracted_data")       // Structured extraction JSON produced by the LLM
+  quoteVerification  Json?     @map("quote_verification")   // fuzzyQuoteMatch three-tier confidence result
+  manualOverrides    Json?     @map("manual_overrides")     // HITL manual edit record (M2)
+  reviewStatus       String    @default("pending") @map("review_status") // pending | approved
+  reviewedAt         DateTime? @map("reviewed_at")
+  errorMessage       String?   @map("error_message")
+  processedAt        DateTime? @map("processed_at")
+  createdAt          DateTime  @default(now()) @map("created_at")
+  updatedAt          DateTime  @updatedAt @map("updated_at")
+
+  task AslExtractionTask @relation("TaskExtractionResults", fields: [taskId], references: [id], onDelete: Cascade)
+
+  @@index([taskId, status], map: "idx_extraction_results_task_status") // Keeps the Aggregator's groupBy fast
+  @@index([taskId], map: "idx_extraction_results_task_id") // NOTE(review): taskId is already the leading column of the composite index above; likely redundant
+  @@index([projectId], map: "idx_extraction_results_project_id")
+  @@map("extraction_results")
+  @@schema("asl_schema")
+}
+
 model DCHealthCheck {
  id              String   @id @default(uuid())
  userId          String   @map("user_id")
--- a/backend/prisma/seed-extraction-templates.ts
+++ b/backend/prisma/seed-extraction-templates.ts
@@ -0,0 +1,182 @@
+import { PrismaClient } from '@prisma/client';
+
+const prisma = new PrismaClient();
+
+/**
+ * Seed data for the built-in system extraction templates of ASL Tool 3.
+ * Three templates: RCT / Cohort / QC. Each baseFields section is an array of { key, label, type, description }.
+ * Field definitions come from the "ASL Tool 3 Full-text Extraction Data Dictionary and Specification".
+ */
+const SYSTEM_TEMPLATES = [
+  {
+    code: 'RCT',
+    name: '随机对照试验 (RCT)',
+    description: '适用于随机对照试验文献的结构化数据提取，包含基线特征、RoB 2.0 偏倚风险评估和多种结局指标类型',
+    baseFields: {
+      metadata: [
+        { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份，如 Gandhi 2018' },
+        { key: 'nct_number', label: '临床试验注册号', type: 'string', description: 'ClinicalTrials.gov 注册号' },
+        { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 RCT, Phase III RCT' },
+        { key: 'funding_source', label: '资金来源', type: 'string', description: '资助方与利益冲突声明' },
+      ],
+      baseline: [
+        { key: 'treatment_name', label: '实验组干预', type: 'string', description: '含剂量/频次' },
+        { key: 'control_name', label: '对照组干预', type: 'string', description: '如 Placebo' },
+        { key: 'n_treatment', label: '实验组样本量', type: 'integer', description: 'Table 1 中 N=xxx' },
+        { key: 'n_control', label: '对照组样本量', type: 'integer', description: 'Table 1 中 N=xxx' },
+        { key: 'age_treatment', label: '实验组年龄', type: 'string', description: 'Mean±SD 或 Median(IQR)' },
+        { key: 'age_control', label: '对照组年龄', type: 'string', description: 'Mean±SD 或 Median(IQR)' },
+        { key: 'male_percent', label: '男性比例(%)', type: 'string', description: '整体或分组' },
+      ],
+      rob: [
+        { key: 'rob_randomization', label: '随机序列产生', type: 'string', description: 'Low/High/Unclear Risk' },
+        { key: 'rob_allocation', label: '分配隐藏', type: 'string', description: 'Low/High/Unclear Risk' },
+        { key: 'rob_blinding', label: '盲法实施', type: 'string', description: 'Low/High/Unclear Risk' },
+        { key: 'rob_attrition', label: '失访与数据完整性', type: 'string', description: 'Low/High/Unclear Risk' },
+      ],
+      outcomes_survival: [
+        { key: 'endpoint_name', label: '终点名称', type: 'string', description: '如 OS, PFS, MACE' },
+        { key: 'hr_value', label: '风险比 (HR)', type: 'number', description: 'Hazard Ratio' },
+        { key: 'hr_ci_lower', label: '95%CI 下限', type: 'number', description: '' },
+        { key: 'hr_ci_upper', label: '95%CI 上限', type: 'number', description: '' },
+        { key: 'p_value', label: 'P 值', type: 'string', description: '如 <0.001 或 0.032' },
+      ],
+      outcomes_dichotomous: [
+        { key: 'event_treatment', label: '实验组事件数', type: 'integer', description: '发生事件的具体人数' },
+        { key: 'total_treatment', label: '实验组分析总人数', type: 'integer', description: '可能与基线总人数不同' },
+        { key: 'event_control', label: '对照组事件数', type: 'integer', description: '' },
+        { key: 'total_control', label: '对照组分析总人数', type: 'integer', description: '' },
+      ],
+      outcomes_continuous: [
+        { key: 'mean_treatment', label: '实验组均值', type: 'number', description: '' },
+        { key: 'sd_treatment', label: '实验组标准差', type: 'number', description: 'SD，若原文为 SE/CI 需换算' },
+        { key: 'n_treatment_outcome', label: '实验组分析人数', type: 'integer', description: '' },
+        { key: 'mean_control', label: '对照组均值', type: 'number', description: '' },
+        { key: 'sd_control', label: '对照组标准差', type: 'number', description: '' },
+        { key: 'n_control_outcome', label: '对照组分析人数', type: 'integer', description: '' },
+      ],
+    },
+  },
+  {
+    code: 'Cohort',
+    name: '队列研究 (Cohort)',
+    description: '适用于前瞻性/回顾性队列研究，基线特征与 RCT 类似但无随机化相关偏倚评估',
+    baseFields: {
+      metadata: [
+        { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份' },
+        { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 Prospective Cohort, Retrospective Cohort' },
+        { key: 'funding_source', label: '资金来源', type: 'string', description: '' },
+        { key: 'follow_up_duration', label: '随访时长', type: 'string', description: '如 Median 5.2 years' },
+      ],
+      baseline: [
+        { key: 'exposure_group', label: '暴露组', type: 'string', description: '暴露因素描述' },
+        { key: 'control_group', label: '非暴露组/对照组', type: 'string', description: '' },
+        { key: 'n_exposure', label: '暴露组样本量', type: 'integer', description: '' },
+        { key: 'n_control', label: '对照组样本量', type: 'integer', description: '' },
+        { key: 'age_exposure', label: '暴露组年龄', type: 'string', description: '' },
+        { key: 'age_control', label: '对照组年龄', type: 'string', description: '' },
+        { key: 'male_percent', label: '男性比例(%)', type: 'string', description: '' },
+      ],
+      rob: [
+        { key: 'rob_selection', label: '选择偏倚', type: 'string', description: 'NOS: 代表性、非暴露组选择、暴露确定' },
+        { key: 'rob_comparability', label: '可比性', type: 'string', description: 'NOS: 混杂因素控制' },
+        { key: 'rob_outcome', label: '结局评估', type: 'string', description: 'NOS: 结局评估、随访充分性' },
+      ],
+      outcomes_survival: [
+        { key: 'endpoint_name', label: '终点名称', type: 'string', description: '' },
+        { key: 'hr_value', label: '风险比 (HR)', type: 'number', description: '调整后 HR' },
+        { key: 'hr_ci_lower', label: '95%CI 下限', type: 'number', description: '' },
+        { key: 'hr_ci_upper', label: '95%CI 上限', type: 'number', description: '' },
+        { key: 'p_value', label: 'P 值', type: 'string', description: '' },
+      ],
+      outcomes_dichotomous: [
+        { key: 'event_treatment', label: '暴露组事件数', type: 'integer', description: '' },
+        { key: 'total_treatment', label: '暴露组总人数', type: 'integer', description: '' },
+        { key: 'event_control', label: '对照组事件数', type: 'integer', description: '' },
+        { key: 'total_control', label: '对照组总人数', type: 'integer', description: '' },
+      ],
+      outcomes_continuous: [
+        { key: 'mean_treatment', label: '暴露组均值', type: 'number', description: '' },
+        { key: 'sd_treatment', label: '暴露组标准差', type: 'number', description: '' },
+        { key: 'n_treatment_outcome', label: '暴露组分析人数', type: 'integer', description: '' },
+        { key: 'mean_control', label: '对照组均值', type: 'number', description: '' },
+        { key: 'sd_control', label: '对照组标准差', type: 'number', description: '' },
+        { key: 'n_control_outcome', label: '对照组分析人数', type: 'integer', description: '' },
+      ],
+    },
+  },
+  {
+    code: 'QC', // NOTE(review): this template has no outcomes_survival section, yet AslProjectTemplate.outcomeType defaults to "survival" — confirm consumers handle the missing section
+    name: '质量改进研究 (QI/QC)',
+    description: '适用于质量改进研究，关注干预前后的指标变化，偏倚评估采用 ROBINS-I 简化版',
+    baseFields: {
+      metadata: [
+        { key: 'study_id', label: '研究标识', type: 'string', description: '第一作者+年份' },
+        { key: 'study_design', label: '研究设计类型', type: 'string', description: '如 Before-After, ITS' },
+        { key: 'setting', label: '研究场景', type: 'string', description: '如 ICU, Emergency Department' },
+        { key: 'funding_source', label: '资金来源', type: 'string', description: '' },
+      ],
+      baseline: [
+        { key: 'intervention_name', label: 'QI 干预措施', type: 'string', description: '质量改进措施描述' },
+        { key: 'comparator', label: '对照/基线', type: 'string', description: '如 Pre-intervention period' },
+        { key: 'n_intervention', label: '干预组样本量', type: 'integer', description: '' },
+        { key: 'n_comparator', label: '对照组样本量', type: 'integer', description: '' },
+        { key: 'duration_pre', label: '干预前观察期', type: 'string', description: '' },
+        { key: 'duration_post', label: '干预后观察期', type: 'string', description: '' },
+      ],
+      rob: [
+        { key: 'rob_confounding', label: '混杂偏倚', type: 'string', description: 'ROBINS-I 简化' },
+        { key: 'rob_measurement', label: '测量偏倚', type: 'string', description: '结局指标测量方法是否一致' },
+        { key: 'rob_reporting', label: '报告偏倚', type: 'string', description: '是否选择性报告' },
+      ],
+      outcomes_dichotomous: [
+        { key: 'event_treatment', label: '干预后事件数', type: 'integer', description: '' },
+        { key: 'total_treatment', label: '干预后总人数', type: 'integer', description: '' },
+        { key: 'event_control', label: '干预前事件数', type: 'integer', description: '' },
+        { key: 'total_control', label: '干预前总人数', type: 'integer', description: '' },
+      ],
+      outcomes_continuous: [
+        { key: 'mean_treatment', label: '干预后均值', type: 'number', description: '' },
+        { key: 'sd_treatment', label: '干预后标准差', type: 'number', description: '' },
+        { key: 'n_treatment_outcome', label: '干预后分析人数', type: 'integer', description: '' },
+        { key: 'mean_control', label: '干预前均值', type: 'number', description: '' },
+        { key: 'sd_control', label: '干预前标准差', type: 'number', description: '' },
+        { key: 'n_control_outcome', label: '干预前分析人数', type: 'integer', description: '' },
+      ],
+    },
+  },
+];
+
+/**
+ * Upserts every entry of SYSTEM_TEMPLATES, matched on its unique `code`,
+ * then logs how many system templates the table holds.
+ *
+ * Templates are written one at a time, not inside a single transaction, so a
+ * failure part-way through leaves the earlier upserts in place.
+ *
+ * @returns resolves once all templates are written and the summary is logged
+ * @throws rethrows any error raised by the Prisma client
+ */
+async function main() {
+  console.log('🌱 ASL 工具 3：注入系统内置提取模板...\n');
+
+  for (const template of SYSTEM_TEMPLATES) {
+    // Matching on `code` lets the seed be re-run: an existing row gets its
+    // name/description/baseFields refreshed, while isSystem is left untouched.
+    const result = await prisma.aslExtractionTemplate.upsert({
+      where: { code: template.code },
+      update: {
+        name: template.name,
+        description: template.description,
+        baseFields: template.baseFields,
+      },
+      create: {
+        code: template.code,
+        name: template.name,
+        description: template.description,
+        baseFields: template.baseFields,
+        isSystem: true,
+      },
+    });
+    console.log(`   ✅ ${result.code}: ${result.name}`);
+  }
+
+  // The summary reports "system templates", so count only rows flagged
+  // isSystem rather than every row in the table.
+  const count = await prisma.aslExtractionTemplate.count({ where: { isSystem: true } });
+  console.log(`\n🎉 完成！共 ${count} 套系统模板。`);
+}
+
+// Script entry point: run the seed, always release the Prisma connection,
+// and exit with status 1 if anything (including the disconnect) fails.
+(async () => {
+  try {
+    await main();
+    await prisma.$disconnect();
+  } catch (e) {
+    console.error('❌ Seed 失败:', e);
+    await prisma.$disconnect();
+    process.exit(1);
+  }
+})();