feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- Seed data for 3 system templates (RCT, Cohort, QC)
- M1 integration test suite
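The Phantom Retry Guard above can be sketched as an idempotency check at the top of the worker: a redelivered job for a task that already reached a terminal state becomes a no-op instead of a duplicate extraction. This is an illustrative sketch only; the names (`TaskRecord`, `InMemoryTaskStore`, `handleJob`) are hypothetical and not the actual ExtractionSingleWorker code.

```typescript
// Sketch of a "phantom retry guard": the worker refuses to re-process a
// task that is already in a terminal state, so duplicate queue deliveries
// are harmless. All names here are illustrative.

type TaskStatus = 'pending' | 'running' | 'completed' | 'failed';

interface TaskRecord {
  id: string;
  status: TaskStatus;
  result?: string;
}

class InMemoryTaskStore {
  private tasks = new Map<string, TaskRecord>();
  upsert(task: TaskRecord): void { this.tasks.set(task.id, task); }
  get(id: string): TaskRecord | undefined { return this.tasks.get(id); }
}

/** Returns true if real work was performed, false if a retry was skipped. */
async function handleJob(
  store: InMemoryTaskStore,
  taskId: string,
  runExtraction: () => Promise<string>,
): Promise<boolean> {
  const task = store.get(taskId);
  if (!task) throw new Error(`Unknown task: ${taskId}`);
  // Idempotency guard: terminal tasks are never re-processed.
  if (task.status === 'completed' || task.status === 'failed') return false;
  store.upsert({ ...task, status: 'running' });
  const result = await runExtraction();
  store.upsert({ id: taskId, status: 'completed', result });
  return true;
}
```

In the real pipeline the store would be one of the new DB tables and the status transition would be a conditional UPDATE, but the guard shape is the same.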

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite
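The 3-tier confidence scoring of the quote validator can be sketched as exact match, match after normalization, then token-overlap fallback. The tier names and the 0.8 overlap threshold below are assumptions for illustration, not the shipped `fuzzyQuoteMatch` rules.

```typescript
// Sketch of a three-tier quote validator: tier 1 is an exact substring
// match, tier 2 matches after collapsing whitespace and case, tier 3
// scores the fraction of quote tokens found in the source. Thresholds
// and tier labels are illustrative assumptions.

type QuoteConfidence = 'exact' | 'normalized' | 'fuzzy' | 'none';

function normalize(s: string): string {
  return s.toLowerCase().replace(/\s+/g, ' ').trim();
}

function fuzzyQuoteMatch(quote: string, sourceText: string): QuoteConfidence {
  if (sourceText.includes(quote)) return 'exact';
  const nQuote = normalize(quote);
  const nSource = normalize(sourceText);
  if (nSource.includes(nQuote)) return 'normalized';
  // Tier 3: fraction of significant quote tokens present in the source.
  const tokens = nQuote.split(' ').filter(t => t.length > 2);
  if (tokens.length === 0) return 'none';
  const hits = tokens.filter(t => nSource.includes(t)).length;
  return hits / tokens.length >= 0.8 ? 'fuzzy' : 'none';
}
```

This ordering matters: the cheap exact check short-circuits before any normalization or token scan.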

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse
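The last fix can be sketched as a pre-parse cleanup step: LLM output often arrives wrapped in a markdown fence and may contain raw C0 control characters inside string values, which `JSON.parse` rejects. The version below simply deletes those characters (losing any embedded literal newlines); the actual sanitizer in this commit may escape them instead, so treat this as a minimal sketch.

```typescript
// Sketch of an LLM JSON sanitizer: strip a surrounding markdown fence,
// then delete C0 control characters (U+0000-U+001F), which are illegal
// unescaped inside JSON strings and safe to drop between tokens.

function sanitizeLlmJson(raw: string): string {
  return raw
    // Drop a leading ```json (or bare ```) fence if present.
    .replace(/^\s*```(?:json)?\s*/i, '')
    // Drop a trailing ``` fence if present.
    .replace(/\s*```\s*$/, '')
    // Delete all C0 control characters.
    .replace(/[\u0000-\u001F]/g, '');
}

function parseLlmJson<T>(raw: string): T {
  return JSON.parse(sanitizeLlmJson(raw)) as T;
}
```

The blunt delete is the simplest repair that guarantees `JSON.parse` sees only legal characters; an escaping variant would preserve newline content inside strings at the cost of more regex care.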

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions


@@ -0,0 +1,96 @@
/**
 * PKB data export service (maintained by the PKB module).
 *
 * ACL (anti-corruption layer) boundary: returns plain DTO objects only
 * and never exposes Prisma types to callers.
 * Consumer: ASL PkbBridgeService.
 */
import { prisma } from '../../../config/database.js';

export interface PkbDocumentExportDTO {
  documentId: string;
  storageKey: string;
  filename: string;
  extractedText: string | null;
  fileSizeBytes: number;
}

export interface PkbKnowledgeBaseExportDTO {
  id: string;
  name: string;
  fileCount: number;
}

class PkbExportServiceImpl {
  async listKnowledgeBases(userId: string): Promise<PkbKnowledgeBaseExportDTO[]> {
    const kbs = await prisma.knowledgeBase.findMany({
      where: { userId },
      select: {
        id: true,
        name: true,
        fileCount: true,
      },
      orderBy: { updatedAt: 'desc' },
    });
    return kbs.map(kb => ({
      id: kb.id,
      name: kb.name,
      fileCount: kb.fileCount,
    }));
  }

  async listPdfDocuments(kbId: string): Promise<PkbDocumentExportDTO[]> {
    const docs = await prisma.document.findMany({
      where: {
        kbId,
        fileType: { in: ['pdf', 'application/pdf'] },
      },
      select: {
        id: true,
        storageKey: true,
        filename: true,
        extractedText: true,
        fileSizeBytes: true,
      },
      orderBy: { uploadedAt: 'desc' },
    });
    return docs.map(doc => ({
      documentId: doc.id,
      storageKey: doc.storageKey || '',
      filename: doc.filename,
      extractedText: doc.extractedText,
      // Byte counts may come back as BigInt; coerce to number for the DTO.
      fileSizeBytes: Number(doc.fileSizeBytes || 0),
    }));
  }

  async getDocumentForExtraction(documentId: string): Promise<PkbDocumentExportDTO> {
    const doc = await prisma.document.findUnique({
      where: { id: documentId },
      select: {
        id: true,
        storageKey: true,
        filename: true,
        extractedText: true,
        fileSizeBytes: true,
      },
    });
    if (!doc) {
      // Tag the error by name so consumers can detect it without importing
      // a PKB-internal error class across the ACL boundary.
      const err = new Error(`PKB Document not found: ${documentId}`);
      (err as any).name = 'PkbDocumentNotFoundError';
      throw err;
    }
    return {
      documentId: doc.id,
      storageKey: doc.storageKey || '',
      filename: doc.filename,
      extractedText: doc.extractedText,
      fileSizeBytes: Number(doc.fileSizeBytes || 0),
    };
  }
}

export const pkbExportService = new PkbExportServiceImpl();