feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench
M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- 3 system templates seed (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,91 @@
|
||||
-- ASL Tool 3: Full-text Smart Extraction Workbench V2.0
|
||||
-- Architecture: Scatter-dispatch + Independent Worker + Aggregator polling reconciliation
|
||||
-- 4 new tables in asl_schema
|
||||
|
||||
-- CreateTable: System extraction templates (RCT / Cohort / QC)
-- One row per template. "code" is made unique by the
-- "extraction_templates_code_key" index created further down in this file.
CREATE TABLE "asl_schema"."extraction_templates" (
    "id" TEXT NOT NULL,
    "code" TEXT NOT NULL,
    "name" TEXT NOT NULL,
    "description" TEXT,
    -- NOTE(review): the only camelCase column among the four tables in this
    -- migration; all others are snake_case. Presumably mirrors an unmapped
    -- ORM field name -- confirm against the application schema before renaming.
    "baseFields" JSONB NOT NULL,
    "is_system" BOOLEAN NOT NULL DEFAULT true,
    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No DEFAULT here: every INSERT must supply "updated_at" explicitly.
    "updated_at" TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_templates_pkey" PRIMARY KEY ("id")
);
|
||||
|
||||
-- CreateTable: Project-level templates (cloned from system + custom fields)
-- Each row ties a project to one base template. The pair
-- ("project_id", "base_template_id") is made unique by the
-- "unique_extraction_project_base_template" index created further down.
CREATE TABLE "asl_schema"."extraction_project_templates" (
    "id" TEXT NOT NULL,
    "project_id" TEXT NOT NULL,
    "user_id" TEXT NOT NULL,
    -- References "extraction_templates"("id"); the foreign key is added at
    -- the end of this file with ON DELETE RESTRICT.
    "base_template_id" TEXT NOT NULL,
    "outcome_type" TEXT NOT NULL DEFAULT 'survival',
    -- Defaults to an empty JSON array.
    "custom_fields" JSONB NOT NULL DEFAULT '[]',
    "is_locked" BOOLEAN NOT NULL DEFAULT false,
    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No DEFAULT here: every INSERT must supply "updated_at" explicitly.
    "updated_at" TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_project_templates_pkey" PRIMARY KEY ("id")
);
|
||||
|
||||
-- CreateTable: Extraction tasks (1 task = batch extract N documents)
CREATE TABLE "asl_schema"."extraction_tasks" (
    "id" TEXT NOT NULL,
    "project_id" TEXT NOT NULL,
    "user_id" TEXT NOT NULL,
    -- References "extraction_project_templates"("id"); the foreign key is
    -- added at the end of this file with ON DELETE RESTRICT.
    "project_template_id" TEXT NOT NULL,
    "pkb_knowledge_base_id" TEXT NOT NULL,
    -- Nullable, and covered by the unique index
    -- "extraction_tasks_idempotency_key_key" created further down. Rows with
    -- a NULL key are not deduplicated: PostgreSQL unique indexes treat NULLs
    -- as distinct by default.
    "idempotency_key" TEXT,
    "total_count" INTEGER NOT NULL,
    -- Free-form TEXT; no CHECK constraint restricts the allowed values.
    "status" TEXT NOT NULL DEFAULT 'processing',
    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No DEFAULT here: every INSERT must supply "updated_at" explicitly.
    "updated_at" TIMESTAMP(3) NOT NULL,
    "completed_at" TIMESTAMP(3),

    CONSTRAINT "extraction_tasks_pkey" PRIMARY KEY ("id")
);
|
||||
|
||||
-- CreateTable: Per-document extraction results (Worker only writes its own row)
-- NOTE(review): nothing in this migration prevents two rows for the same
-- ("task_id", "pkb_document_id"). If exactly one result per document per task
-- is intended, confirm that it is enforced elsewhere.
CREATE TABLE "asl_schema"."extraction_results" (
    "id" TEXT NOT NULL,
    -- References "extraction_tasks"("id"); the foreign key is added at the
    -- end of this file with ON DELETE CASCADE, so deleting a task deletes
    -- its result rows.
    "task_id" TEXT NOT NULL,
    -- Indexed further down, but no foreign key is declared for this column
    -- in this migration.
    "project_id" TEXT NOT NULL,
    "pkb_document_id" TEXT NOT NULL,
    "snapshot_storage_key" TEXT NOT NULL,
    "snapshot_filename" TEXT NOT NULL,
    -- Free-form TEXT; no CHECK constraint restricts the allowed values.
    "status" TEXT NOT NULL DEFAULT 'pending',
    "extracted_data" JSONB,
    "quote_verification" JSONB,
    "manual_overrides" JSONB,
    -- Tracked separately from "status"; also unconstrained TEXT.
    "review_status" TEXT NOT NULL DEFAULT 'pending',
    "reviewed_at" TIMESTAMP(3),
    "error_message" TEXT,
    "processed_at" TIMESTAMP(3),
    "created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- No DEFAULT here: every INSERT must supply "updated_at" explicitly.
    "updated_at" TIMESTAMP(3) NOT NULL,

    CONSTRAINT "extraction_results_pkey" PRIMARY KEY ("id")
);
|
||||
|
||||
-- Unique indexes

-- Template codes are unique across "extraction_templates".
CREATE UNIQUE INDEX "extraction_templates_code_key"
    ON "asl_schema"."extraction_templates"("code");

-- At most one task per non-NULL idempotency key. The column is nullable, and
-- NULL keys are not constrained (PostgreSQL treats NULLs as distinct by default).
CREATE UNIQUE INDEX "extraction_tasks_idempotency_key_key"
    ON "asl_schema"."extraction_tasks"("idempotency_key");

-- A project can hold at most one project template per base template.
CREATE UNIQUE INDEX "unique_extraction_project_base_template"
    ON "asl_schema"."extraction_project_templates"("project_id", "base_template_id");
|
||||
|
||||
-- Performance indexes

-- extraction_project_templates
-- NOTE(review): ("project_id") is the leading column of the unique index
-- "unique_extraction_project_base_template" ("project_id", "base_template_id"),
-- which can already serve lookups by "project_id". This index is likely
-- redundant -- confirm against the ORM schema before dropping it.
CREATE INDEX "idx_extraction_project_templates_project_id"
    ON "asl_schema"."extraction_project_templates"("project_id");
CREATE INDEX "idx_extraction_project_templates_user_id"
    ON "asl_schema"."extraction_project_templates"("user_id");

-- extraction_tasks
CREATE INDEX "idx_extraction_tasks_project_id"
    ON "asl_schema"."extraction_tasks"("project_id");
CREATE INDEX "idx_extraction_tasks_user_id"
    ON "asl_schema"."extraction_tasks"("user_id");
CREATE INDEX "idx_extraction_tasks_status"
    ON "asl_schema"."extraction_tasks"("status");

-- extraction_results
-- Composite index for filtering a task's results by status.
CREATE INDEX "idx_extraction_results_task_status"
    ON "asl_schema"."extraction_results"("task_id", "status");
-- NOTE(review): ("task_id") is the leading column of
-- "idx_extraction_results_task_status" above, so this index is likely
-- redundant -- confirm against the ORM schema before dropping it.
CREATE INDEX "idx_extraction_results_task_id"
    ON "asl_schema"."extraction_results"("task_id");
CREATE INDEX "idx_extraction_results_project_id"
    ON "asl_schema"."extraction_results"("project_id");
|
||||
|
||||
-- Foreign keys
-- Chain: extraction_templates <- extraction_project_templates
--        <- extraction_tasks <- extraction_results.
-- The first two links use ON DELETE RESTRICT, so a referenced template or
-- project template cannot be deleted while rows still point at it. The last
-- link uses ON DELETE CASCADE, so deleting a task removes its result rows.
-- All three propagate primary-key updates (ON UPDATE CASCADE).

ALTER TABLE "asl_schema"."extraction_project_templates"
    ADD CONSTRAINT "extraction_project_templates_base_template_id_fkey"
    FOREIGN KEY ("base_template_id")
    REFERENCES "asl_schema"."extraction_templates"("id")
    ON DELETE RESTRICT ON UPDATE CASCADE;

ALTER TABLE "asl_schema"."extraction_tasks"
    ADD CONSTRAINT "extraction_tasks_project_template_id_fkey"
    FOREIGN KEY ("project_template_id")
    REFERENCES "asl_schema"."extraction_project_templates"("id")
    ON DELETE RESTRICT ON UPDATE CASCADE;

ALTER TABLE "asl_schema"."extraction_results"
    ADD CONSTRAINT "extraction_results_task_id_fkey"
    FOREIGN KEY ("task_id")
    REFERENCES "asl_schema"."extraction_tasks"("id")
    ON DELETE CASCADE ON UPDATE CASCADE;
|
||||
Reference in New Issue
Block a user