M1 Skeleton Pipeline: - Scatter-dispatch + Aggregator polling pattern (PgBoss) - PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs) - ExtractionSingleWorker with DeepSeek-V3 LLM extraction - PermanentExtractionError for non-retryable failures - Phantom Retry Guard (idempotent worker) - 3-step minimal frontend (Setup -> Progress -> Workbench) - 4 new DB tables (extraction_templates, project_templates, tasks, results) - 3 system templates seed (RCT, Cohort, QC) - M1 integration test suite M2 HITL Workbench: - MinerU VLM integration for high-fidelity table extraction - XML-isolated DynamicPromptBuilder with flat JSON output template - fuzzyQuoteMatch validator (3-tier confidence scoring) - SSE real-time logging via ExtractionEventBus - Schema-driven ExtractionDrawer (dynamic field rendering from template) - Excel wide-table export with flattenModuleData normalization - M2 integration test suite Critical Fixes (data normalization): - DynamicPromptBuilder: explicit flat key-value output format with example - ExtractionExcelExporter: handle both array and flat data formats - ExtractionDrawer: schema-driven rendering instead of hardcoded fields - ExtractionValidator: array-format quote verification support - SSE route: Fastify register encapsulation to bypass auth for EventSource - LLM JSON sanitizer: strip illegal control chars before JSON.parse Also includes: RVW stats verification spec, SSA expert config guide Tested: M1 pipeline test + M2 HITL test + manual frontend verification Co-authored-by: Cursor <cursoragent@cursor.com>
92 lines
4.4 KiB
SQL
92 lines
4.4 KiB
SQL
-- ASL Tool 3: Full-text Smart Extraction Workbench V2.0
|
|
-- Architecture: Scatter-dispatch + Independent Worker + Aggregator polling reconciliation
|
|
-- 4 new tables in asl_schema
|
|
|
|
-- CreateTable: System extraction templates (RCT / Cohort / QC)
|
|
CREATE TABLE "asl_schema"."extraction_templates" (
|
|
"id" TEXT NOT NULL,
|
|
"code" TEXT NOT NULL,
|
|
"name" TEXT NOT NULL,
|
|
"description" TEXT,
|
|
"baseFields" JSONB NOT NULL,
|
|
"is_system" BOOLEAN NOT NULL DEFAULT true,
|
|
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
"updated_at" TIMESTAMP(3) NOT NULL,
|
|
|
|
CONSTRAINT "extraction_templates_pkey" PRIMARY KEY ("id")
|
|
);
|
|
|
|
-- CreateTable: Project-level templates (cloned from system + custom fields)
|
|
CREATE TABLE "asl_schema"."extraction_project_templates" (
|
|
"id" TEXT NOT NULL,
|
|
"project_id" TEXT NOT NULL,
|
|
"user_id" TEXT NOT NULL,
|
|
"base_template_id" TEXT NOT NULL,
|
|
"outcome_type" TEXT NOT NULL DEFAULT 'survival',
|
|
"custom_fields" JSONB NOT NULL DEFAULT '[]',
|
|
"is_locked" BOOLEAN NOT NULL DEFAULT false,
|
|
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
"updated_at" TIMESTAMP(3) NOT NULL,
|
|
|
|
CONSTRAINT "extraction_project_templates_pkey" PRIMARY KEY ("id")
|
|
);
|
|
|
|
-- CreateTable: Extraction tasks (1 task = batch extract N documents)
|
|
CREATE TABLE "asl_schema"."extraction_tasks" (
|
|
"id" TEXT NOT NULL,
|
|
"project_id" TEXT NOT NULL,
|
|
"user_id" TEXT NOT NULL,
|
|
"project_template_id" TEXT NOT NULL,
|
|
"pkb_knowledge_base_id" TEXT NOT NULL,
|
|
"idempotency_key" TEXT,
|
|
"total_count" INTEGER NOT NULL,
|
|
"status" TEXT NOT NULL DEFAULT 'processing',
|
|
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
"updated_at" TIMESTAMP(3) NOT NULL,
|
|
"completed_at" TIMESTAMP(3),
|
|
|
|
CONSTRAINT "extraction_tasks_pkey" PRIMARY KEY ("id")
|
|
);
|
|
|
|
-- CreateTable: Per-document extraction results (Worker only writes its own row)
|
|
CREATE TABLE "asl_schema"."extraction_results" (
|
|
"id" TEXT NOT NULL,
|
|
"task_id" TEXT NOT NULL,
|
|
"project_id" TEXT NOT NULL,
|
|
"pkb_document_id" TEXT NOT NULL,
|
|
"snapshot_storage_key" TEXT NOT NULL,
|
|
"snapshot_filename" TEXT NOT NULL,
|
|
"status" TEXT NOT NULL DEFAULT 'pending',
|
|
"extracted_data" JSONB,
|
|
"quote_verification" JSONB,
|
|
"manual_overrides" JSONB,
|
|
"review_status" TEXT NOT NULL DEFAULT 'pending',
|
|
"reviewed_at" TIMESTAMP(3),
|
|
"error_message" TEXT,
|
|
"processed_at" TIMESTAMP(3),
|
|
"created_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
|
"updated_at" TIMESTAMP(3) NOT NULL,
|
|
|
|
CONSTRAINT "extraction_results_pkey" PRIMARY KEY ("id")
|
|
);
|
|
|
|
-- Unique indexes
|
|
CREATE UNIQUE INDEX "extraction_templates_code_key" ON "asl_schema"."extraction_templates"("code");
|
|
CREATE UNIQUE INDEX "extraction_tasks_idempotency_key_key" ON "asl_schema"."extraction_tasks"("idempotency_key");
|
|
CREATE UNIQUE INDEX "unique_extraction_project_base_template" ON "asl_schema"."extraction_project_templates"("project_id", "base_template_id");
|
|
|
|
-- Performance indexes
|
|
CREATE INDEX "idx_extraction_project_templates_project_id" ON "asl_schema"."extraction_project_templates"("project_id");
|
|
CREATE INDEX "idx_extraction_project_templates_user_id" ON "asl_schema"."extraction_project_templates"("user_id");
|
|
CREATE INDEX "idx_extraction_tasks_project_id" ON "asl_schema"."extraction_tasks"("project_id");
|
|
CREATE INDEX "idx_extraction_tasks_user_id" ON "asl_schema"."extraction_tasks"("user_id");
|
|
CREATE INDEX "idx_extraction_tasks_status" ON "asl_schema"."extraction_tasks"("status");
|
|
CREATE INDEX "idx_extraction_results_task_status" ON "asl_schema"."extraction_results"("task_id", "status");
|
|
CREATE INDEX "idx_extraction_results_task_id" ON "asl_schema"."extraction_results"("task_id");
|
|
CREATE INDEX "idx_extraction_results_project_id" ON "asl_schema"."extraction_results"("project_id");
|
|
|
|
-- Foreign keys
|
|
ALTER TABLE "asl_schema"."extraction_project_templates" ADD CONSTRAINT "extraction_project_templates_base_template_id_fkey" FOREIGN KEY ("base_template_id") REFERENCES "asl_schema"."extraction_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
|
|
ALTER TABLE "asl_schema"."extraction_tasks" ADD CONSTRAINT "extraction_tasks_project_template_id_fkey" FOREIGN KEY ("project_template_id") REFERENCES "asl_schema"."extraction_project_templates"("id") ON DELETE RESTRICT ON UPDATE CASCADE;
|
|
ALTER TABLE "asl_schema"."extraction_results" ADD CONSTRAINT "extraction_results_task_id_fkey" FOREIGN KEY ("task_id") REFERENCES "asl_schema"."extraction_tasks"("id") ON DELETE CASCADE ON UPDATE CASCADE;
|