Files
AIclinicalresearch/backend/prisma/migrations/20260225_add_extraction_template_engine/migration.sql
HaHafeng f0736dbca1 feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench
M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, extraction_project_templates, extraction_tasks, extraction_results)
- Seed data for 3 system templates (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00

92 lines
4.4 KiB
SQL

-- ASL Tool 3: Full-text Smart Extraction Workbench V2.0
-- Architecture: Scatter-dispatch + Independent Worker + Aggregator polling reconciliation
-- 4 new tables in asl_schema
-- CreateTable: system-level extraction templates (RCT / Cohort / QC).
-- One row per template, keyed by "id". "code" is additionally kept unique by
-- the index "extraction_templates_code_key".
-- All timestamps are TIMESTAMP(3): millisecond precision, no time zone.
CREATE TABLE "asl_schema"."extraction_templates" (
    "id"          TEXT         NOT NULL,
    "code"        TEXT         NOT NULL,
    "name"        TEXT         NOT NULL,
    "description" TEXT,                       -- optional (nullable)
    -- NOTE(review): the only camelCase column in this migration; every other
    -- column is snake_case. It must always be double-quoted in SQL. Presumably
    -- a missing column mapping upstream -- confirm before renaming, because
    -- existing readers depend on the current name.
    "baseFields"  JSONB        NOT NULL,
    "is_system"   BOOLEAN      NOT NULL DEFAULT TRUE,
    "created_at"  TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No database default: every INSERT must supply "updated_at" explicitly.
    "updated_at"  TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_templates_pkey" PRIMARY KEY ("id")
);
-- CreateTable: project-level templates (cloned from a system template, plus
-- custom fields).
-- "base_template_id" points at "extraction_templates"."id"; the pair
-- ("project_id", "base_template_id") is kept unique by the index
-- "unique_extraction_project_base_template".
CREATE TABLE "asl_schema"."extraction_project_templates" (
    "id"               TEXT         NOT NULL,
    "project_id"       TEXT         NOT NULL,
    "user_id"          TEXT         NOT NULL,
    "base_template_id" TEXT         NOT NULL,
    -- Free-form text; no CHECK constraint restricts the allowed values.
    "outcome_type"     TEXT         NOT NULL DEFAULT 'survival',
    -- Starts out as an empty JSON array.
    "custom_fields"    JSONB        NOT NULL DEFAULT '[]',
    "is_locked"        BOOLEAN      NOT NULL DEFAULT FALSE,
    "created_at"       TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No database default: every INSERT must supply "updated_at" explicitly.
    "updated_at"       TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_project_templates_pkey" PRIMARY KEY ("id")
);
-- CreateTable: extraction tasks (1 task = one batch extraction over N documents).
-- "project_template_id" points at "extraction_project_templates"."id".
CREATE TABLE "asl_schema"."extraction_tasks" (
    "id"                    TEXT         NOT NULL,
    "project_id"            TEXT         NOT NULL,
    "user_id"               TEXT         NOT NULL,
    "project_template_id"   TEXT         NOT NULL,
    "pkb_knowledge_base_id" TEXT         NOT NULL,
    -- Optional; when present it must be unique across all tasks (enforced by
    -- the index "extraction_tasks_idempotency_key_key").
    "idempotency_key"       TEXT,
    -- Presumably the N documents of the batch -- TODO confirm against the writer.
    "total_count"           INTEGER      NOT NULL,
    -- Free-form text; no CHECK constraint restricts the allowed values.
    "status"                TEXT         NOT NULL DEFAULT 'processing',
    "created_at"            TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No database default: every INSERT must supply "updated_at" explicitly.
    "updated_at"            TIMESTAMP(3) NOT NULL,
    "completed_at"          TIMESTAMP(3),            -- NULL until set by the application

    CONSTRAINT "extraction_tasks_pkey" PRIMARY KEY ("id")
);
-- CreateTable: per-document extraction results (a worker only writes its own row).
-- "task_id" points at "extraction_tasks"."id"; rows are removed together with
-- their task (ON DELETE CASCADE on "extraction_results_task_id_fkey").
CREATE TABLE "asl_schema"."extraction_results" (
    "id"                   TEXT         NOT NULL,
    "task_id"              TEXT         NOT NULL,
    "project_id"           TEXT         NOT NULL,
    "pkb_document_id"      TEXT         NOT NULL,
    "snapshot_storage_key" TEXT         NOT NULL,
    "snapshot_filename"    TEXT         NOT NULL,
    -- Free-form text; no CHECK constraint restricts the allowed values.
    "status"               TEXT         NOT NULL DEFAULT 'pending',
    -- The three JSONB payloads below are nullable and have no default.
    "extracted_data"       JSONB,
    "quote_verification"   JSONB,
    "manual_overrides"     JSONB,
    -- Tracked separately from "status"; also free-form text.
    "review_status"        TEXT         NOT NULL DEFAULT 'pending',
    "reviewed_at"          TIMESTAMP(3),
    "error_message"        TEXT,
    "processed_at"         TIMESTAMP(3),
    "created_at"           TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No database default: every INSERT must supply "updated_at" explicitly.
    "updated_at"           TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_results_pkey" PRIMARY KEY ("id")
);
-- Unique indexes

-- Template codes are globally unique.
CREATE UNIQUE INDEX "extraction_templates_code_key"
    ON "asl_schema"."extraction_templates" ("code");

-- At most one task per idempotency key. The column is nullable, and a plain
-- unique index treats NULLs as distinct, so tasks without a key never conflict.
CREATE UNIQUE INDEX "extraction_tasks_idempotency_key_key"
    ON "asl_schema"."extraction_tasks" ("idempotency_key");

-- A project can hold at most one template per base (system) template.
CREATE UNIQUE INDEX "unique_extraction_project_base_template"
    ON "asl_schema"."extraction_project_templates" ("project_id", "base_template_id");
-- Performance indexes (non-unique lookup paths)

-- extraction_project_templates
-- NOTE(review): "project_id" is also the leading column of the unique index
-- "unique_extraction_project_base_template" ("project_id", "base_template_id"),
-- so this index is likely redundant. Confirm nothing references it by name
-- (later migrations, ORM schema) before dropping it.
CREATE INDEX "idx_extraction_project_templates_project_id"
    ON "asl_schema"."extraction_project_templates" ("project_id");
CREATE INDEX "idx_extraction_project_templates_user_id"
    ON "asl_schema"."extraction_project_templates" ("user_id");

-- extraction_tasks
CREATE INDEX "idx_extraction_tasks_project_id"
    ON "asl_schema"."extraction_tasks" ("project_id");
CREATE INDEX "idx_extraction_tasks_user_id"
    ON "asl_schema"."extraction_tasks" ("user_id");
CREATE INDEX "idx_extraction_tasks_status"
    ON "asl_schema"."extraction_tasks" ("status");

-- extraction_results
CREATE INDEX "idx_extraction_results_task_status"
    ON "asl_schema"."extraction_results" ("task_id", "status");
-- NOTE(review): "task_id" alone is a prefix of the composite index directly
-- above, so this index is likely redundant. Confirm nothing references it by
-- name before dropping it.
CREATE INDEX "idx_extraction_results_task_id"
    ON "asl_schema"."extraction_results" ("task_id");
CREATE INDEX "idx_extraction_results_project_id"
    ON "asl_schema"."extraction_results" ("project_id");
-- Foreign keys
-- Chain: extraction_templates <- extraction_project_templates <- extraction_tasks <- extraction_results.
-- Every key propagates changes of the referenced "id" (ON UPDATE CASCADE).

-- A system template cannot be deleted while a project template still
-- references it (ON DELETE RESTRICT).
ALTER TABLE "asl_schema"."extraction_project_templates"
    ADD CONSTRAINT "extraction_project_templates_base_template_id_fkey"
    FOREIGN KEY ("base_template_id")
    REFERENCES "asl_schema"."extraction_templates" ("id")
    ON DELETE RESTRICT
    ON UPDATE CASCADE;

-- A project template cannot be deleted while a task still references it
-- (ON DELETE RESTRICT).
ALTER TABLE "asl_schema"."extraction_tasks"
    ADD CONSTRAINT "extraction_tasks_project_template_id_fkey"
    FOREIGN KEY ("project_template_id")
    REFERENCES "asl_schema"."extraction_project_templates" ("id")
    ON DELETE RESTRICT
    ON UPDATE CASCADE;

-- Deleting a task deletes all of its result rows (ON DELETE CASCADE).
ALTER TABLE "asl_schema"."extraction_results"
    ADD CONSTRAINT "extraction_results_task_id_fkey"
    FOREIGN KEY ("task_id")
    REFERENCES "asl_schema"."extraction_tasks" ("id")
    ON DELETE CASCADE
    ON UPDATE CASCADE;