feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline: - Scatter-dispatch + Aggregator polling pattern (PgBoss) - PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs) - ExtractionSingleWorker with DeepSeek-V3 LLM extraction - PermanentExtractionError for non-retryable failures - Phantom Retry Guard (idempotent worker) - 3-step minimal frontend (Setup -> Progress -> Workbench) - 4 new DB tables (extraction_templates, project_templates, tasks, results) - 3 system templates seed (RCT, Cohort, QC) - M1 integration test suite M2 HITL Workbench: - MinerU VLM integration for high-fidelity table extraction - XML-isolated DynamicPromptBuilder with flat JSON output template - fuzzyQuoteMatch validator (3-tier confidence scoring) - SSE real-time logging via ExtractionEventBus - Schema-driven ExtractionDrawer (dynamic field rendering from template) - Excel wide-table export with flattenModuleData normalization - M2 integration test suite Critical Fixes (data normalization): - DynamicPromptBuilder: explicit flat key-value output format with example - ExtractionExcelExporter: handle both array and flat data formats - ExtractionDrawer: schema-driven rendering instead of hardcoded fields - ExtractionValidator: array-format quote verification support - SSE route: Fastify register encapsulation to bypass auth for EventSource - LLM JSON sanitizer: strip illegal control chars before JSON.parse Also includes: RVW stats verification spec, SSA expert config guide Tested: M1 pipeline test + M2 HITL test + manual frontend verification Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions
--- a/backend/src/modules/asl/extraction/services/ExtractionValidator.ts
+++ b/backend/src/modules/asl/extraction/services/ExtractionValidator.ts
@@ -0,0 +1,166 @@
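+/**
+ * Extraction result validator — fuzzyQuoteMatch with three confidence levels.
+ *
+ * For every field returned by the LLM, checks whether the accompanying quote
+ * can be found in the source text and rates the match as high / medium / low.
+ *
+ * Search scope = MinerU HTML (tags stripped by the built-in converter; no html-to-text dependency) + the full-text Markdown.
+ */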
+
+import { logger } from '../../../../common/logging/index.js';
+
+interface QuoteVerificationEntry { // verification outcome for a single field's quote
+  confidence: 'high' | 'medium' | 'low'; // confidence tier assigned by fuzzyQuoteMatch
+  quote: string; // the LLM-supplied quote after trimming
+  matchScore: number; // 1.0 exact match, 0.95 normalized match, otherwise token coverage (0 when unscorable)
+}
+
+type QuoteVerificationResult = Record<string, Record<string, QuoteVerificationEntry>>; // module name → field key → entry
+
+class ExtractionValidatorImpl {
+  /**
+   * Build the text that quotes are searched in: the plain text of every
+   * MinerU table HTML fragment, followed by the full-text Markdown.
+   */
+  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
+    const tableTexts = tableHtmls.map((html) => this.htmlToPlainText(html));
+    return [...tableTexts, fullMarkdown].join('\n');
+  }
+
+  /**
+   * Score the quote attached to every extracted field.
+   *
+   * A module's value may be an array `[{ key, value, quote }]` or an object in
+   * either `{ field: value, field_quote }` or `{ field: { value, quote } }` form.
+   * Fields without a non-empty string quote are left out of the result.
+   * @param extractedData module name → extracted fields, as returned by the LLM
+   * @param searchScope   text the quotes are expected to come from
+   * @returns module name → field name → verification entry
+   */
+  verifyAllQuotes(
+    extractedData: Record<string, any>,
+    searchScope: string,
+  ): QuoteVerificationResult {
+    const result: QuoteVerificationResult = {};
+    const normalizedScope = this.normalize(searchScope);
+    // Scores one candidate quote; anything but a non-empty string is skipped.
+    const score = (quote: unknown): QuoteVerificationEntry | undefined =>
+      typeof quote === 'string' && quote
+        ? this.fuzzyQuoteMatch(searchScope, normalizedScope, quote)
+        : undefined;
+
+    for (const [module, fields] of Object.entries(extractedData)) {
+      if (typeof fields !== 'object' || fields === null) continue;
+      const verified: Record<string, QuoteVerificationEntry> = {};
+      result[module] = verified;
+
+      if (Array.isArray(fields)) {
+        for (const item of fields) {
+          if (typeof item !== 'object' || !item || !item.key) continue;
+          const entry = score(item.quote);
+          if (entry) verified[item.key] = entry;
+        }
+        continue;
+      }
+
+      for (const [key, value] of Object.entries(fields)) {
+        // `*_quote` keys are companions of a field, not fields themselves.
+        if (key.endsWith('_quote')) continue;
+        // A nested `{ value, quote }` object takes precedence over `<key>_quote`.
+        const nested = typeof value === 'object' && value !== null && 'quote' in value;
+        const entry = score(nested ? value.quote : fields[`${key}_quote`]);
+        if (entry) verified[key] = entry;
+      }
+    }
+
+    return result;
+  }
+
+  /**
+   * Core algorithm: rate how well an LLM-supplied quote is backed by the scope.
+   *
+   * 1. Exact substring of `rawScope`                    → high   (score 1.0)
+   * 2. Substring once both sides are normalized         → high   (score 0.95)
+   * 3. Token coverage ≥ 50%                             → medium (score = coverage)
+   * 4. Token coverage < 50%                             → low    (score = coverage)
+   *
+   * A quote shorter than 3 characters is always `low` with score 0, as is one
+   * that has no exact match and nothing left to compare after normalization.
+   *
+   * @param rawScope        search text, e.g. from `buildQuoteSearchScope`
+   * @param normalizedScope `rawScope` after normalization, computed once by the caller
+   * @param llmQuote        quote the LLM attached to one field
+   */
+  fuzzyQuoteMatch(
+    rawScope: string,
+    normalizedScope: string,
+    llmQuote: string,
+  ): QuoteVerificationEntry {
+    const trimmedQuote = llmQuote.trim();
+    const rated = (
+      confidence: QuoteVerificationEntry['confidence'],
+      matchScore: number,
+    ): QuoteVerificationEntry => ({ confidence, quote: trimmedQuote, matchScore });
+
+    if (trimmedQuote.length < 3) return rated('low', 0);
+
+    // Tier 1: verbatim match against the untouched scope.
+    if (rawScope.includes(trimmedQuote)) return rated('high', 1.0);
+
+    // Tier 2: match ignoring case, whitespace runs and punctuation. A quote made
+    // only of punctuation normalizes to '' and `includes('')` is always true, so
+    // it is rejected here instead of being reported as a high-confidence match.
+    const normalizedQuote = this.normalize(trimmedQuote);
+    if (normalizedQuote.length === 0) return rated('low', 0);
+    if (normalizedScope.includes(normalizedQuote)) return rated('high', 0.95);
+
+    // Tier 3: share of quote tokens (length ≥ 2) that occur anywhere in the scope.
+    const quoteTokens = this.tokenize(trimmedQuote);
+    if (quoteTokens.length === 0) return rated('low', 0);
+    const hits = quoteTokens.filter((token) => normalizedScope.includes(token));
+    const coverage = hits.length / quoteTokens.length;
+    return rated(coverage >= 0.5 ? 'medium' : 'low', coverage);
+  }
+
+  /**
+   * Lower-case the text, drop every character that is not an ASCII word
+   * character, whitespace or a CJK ideograph (U+4E00–U+9FFF), then collapse
+   * whitespace runs. Punctuation goes first: collapsing before stripping left
+   * "a = b" as "a  b", which could never match a source that reads "a b".
+   */
+  private normalize(text: string): string {
+    return text
+      .toLowerCase()
+      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
+      .replace(/[\s\u00A0]+/g, ' ')
+      .trim();
+  }
+
+  /** Split normalized text on whitespace, keeping tokens of at least 2 characters. */
+  private tokenize(text: string): string[] {
+    return this.normalize(text)
+      .split(/\s+/)
+      .filter((t) => t.length >= 2);
+  }
+
+  /**
+   * Minimal HTML → plain text conversion (avoids an html-to-text dependency).
+   * Table/block tags and `<br>` become newlines; every other tag is removed.
+   */
+  private htmlToPlainText(html: string): string {
+    return html
+      .replace(/<br\s*\/?>/gi, '\n')
+      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
+      .replace(/<[^>]+>/g, '')
+      .replace(/&nbsp;/g, ' ')
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&quot;/g, '"')
+      // `&amp;` is decoded last so `&amp;lt;` yields the text `&lt;`, not `<`.
+      .replace(/&amp;/g, '&')
+      .replace(/\n{3,}/g, '\n\n')
+      .trim();
+  }
+}
+
+export const extractionValidator = new ExtractionValidatorImpl(); // module-level instance exported for callers