feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- 3 system templates seed (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions

View File

@@ -0,0 +1,166 @@
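/**
 * Extraction result validator — fuzzyQuoteMatch with three confidence levels.
 *
 * For every field returned by the LLM, checks whether the accompanying quote
 * can be located in the source text, and grades the match with one of three
 * confidence levels: high / medium / low.
 *
 * Search scope = MinerU table HTML (tags stripped to plain text) concatenated
 * with the full-text Markdown.
 */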
import { logger } from '../../../../common/logging/index.js';
/**
 * Outcome of checking a single LLM-supplied quote against the source text.
 */
interface QuoteVerificationEntry {
/** Confidence tier assigned to the quote by the matcher. */
confidence: 'high' | 'medium' | 'low';
/** The quote that was checked, with surrounding whitespace trimmed. */
quote: string;
/** Numeric match score in the range 0..1; higher means a closer match. */
matchScore: number;
}
/**
 * Verification results keyed first by module name, then by field key.
 */
type QuoteVerificationResult = Record<string, Record<string, QuoteVerificationEntry>>;
/**
 * Validates LLM extraction output by checking that the quote attached to each
 * extracted field can actually be found in the source document, and grades
 * every quote as high / medium / low confidence.
 *
 * The class holds no instance state; all methods are pure functions of their
 * arguments.
 */
class ExtractionValidatorImpl {
  /** Quotes shorter than this (after trimming) are too short to verify. */
  private static readonly MIN_QUOTE_LENGTH = 3;

  /** Tokens shorter than this are ignored by the keyword-coverage stage. */
  private static readonly MIN_TOKEN_LENGTH = 2;

  /** Minimum keyword coverage for a quote to be graded "medium". */
  private static readonly MEDIUM_COVERAGE_THRESHOLD = 0.5;

  /**
   * Named HTML entities decoded by {@link htmlToPlainText}. A Map is used
   * instead of an object literal so that entity names such as "constructor"
   * can never resolve to an inherited property.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  /**
   * Builds the text that quotes are searched in: the plain text of every
   * table HTML fragment followed by the full Markdown, joined by newlines.
   *
   * @param fullMarkdown Full-text Markdown of the document.
   * @param tableHtmls HTML fragments (e.g. MinerU tables) to include as plain text.
   * @returns The concatenated search scope.
   */
  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
    const parts = tableHtmls.map((html) => this.htmlToPlainText(html));
    parts.push(fullMarkdown);
    return parts.join('\n');
  }

  /**
   * Grades the quote of every field in `extractedData`.
   *
   * Each top-level key is treated as a module. A module may use either
   * - the array format `[{ key, value, quote }]`, or
   * - the flat format `{ field: value, field_quote: "..." }`, where a field
   *   value may also be a nested `{ value, quote }` object.
   *
   * Modules whose value is not an object/array are skipped entirely. Fields
   * without a non-empty string quote are omitted from the result.
   *
   * @param extractedData Module name → extracted fields, as returned by the LLM.
   * @param searchScope Text to search in (see {@link buildQuoteSearchScope}).
   * @returns Module name → field key → verification entry.
   */
  verifyAllQuotes(
    extractedData: Record<string, any>,
    searchScope: string,
  ): QuoteVerificationResult {
    const result: QuoteVerificationResult = {};
    // Normalize the (potentially large) scope once, not once per quote.
    const normalizedScope = this.normalize(searchScope);

    for (const [module, fields] of Object.entries(extractedData)) {
      if (typeof fields !== 'object' || fields === null) continue;

      const moduleResult: Record<string, QuoteVerificationEntry> = {};
      result[module] = moduleResult;

      const quotedFields = Array.isArray(fields)
        ? this.collectArrayQuotes(fields)
        : this.collectFlatQuotes(fields);

      for (const [key, quote] of quotedFields) {
        moduleResult[key] = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote);
      }
    }
    return result;
  }

  /**
   * Core algorithm: grades a single quote against the search scope.
   *
   * 1. Quote shorter than 3 characters            → low    (score 0)
   * 2. Exact substring of the raw scope            → high   (score 1.0)
   * 3. Substring after normalization               → high   (score 0.95)
   * 4. Keyword coverage ≥ 50%                      → medium (score = coverage)
   * 5. Keyword coverage < 50%, or no usable tokens → low    (score = coverage / 0)
   *
   * @param rawScope Search scope as produced by {@link buildQuoteSearchScope}.
   * @param normalizedScope `rawScope` after normalization (lower-cased,
   *   whitespace collapsed, punctuation removed).
   * @param llmQuote Quote text supplied by the LLM.
   * @returns Confidence tier, the trimmed quote and the match score.
   */
  fuzzyQuoteMatch(
    rawScope: string,
    normalizedScope: string,
    llmQuote: string,
  ): QuoteVerificationEntry {
    const trimmedQuote = llmQuote.trim();
    if (trimmedQuote.length < ExtractionValidatorImpl.MIN_QUOTE_LENGTH) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }

    // Stage 1: exact substring match.
    if (rawScope.includes(trimmedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 1.0 };
    }

    // Stage 2: normalized substring match. A quote made only of punctuation
    // or symbols normalizes to "", and "".includes-style matching is always
    // true, so an empty normalized quote must not count as a match.
    const normalizedQuote = this.normalize(trimmedQuote);
    if (normalizedQuote.length > 0 && normalizedScope.includes(normalizedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 0.95 };
    }

    // Stage 3: keyword coverage — share of quote tokens found in the scope.
    const quoteTokens = this.tokenize(trimmedQuote);
    if (quoteTokens.length === 0) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }
    const matchedCount = quoteTokens.filter((token) => normalizedScope.includes(token)).length;
    const coverage = matchedCount / quoteTokens.length;
    const confidence =
      coverage >= ExtractionValidatorImpl.MEDIUM_COVERAGE_THRESHOLD ? 'medium' : 'low';
    return { confidence, quote: trimmedQuote, matchScore: coverage };
  }

  /**
   * Collects `[fieldKey, quote]` pairs from an array-format module
   * (`[{ key, value, quote }]`). Items without a usable key (non-empty string
   * or non-zero number) or without a non-empty string quote are skipped.
   */
  private collectArrayQuotes(items: readonly unknown[]): Array<[string, string]> {
    const pairs: Array<[string, string]> = [];
    for (const item of items) {
      if (typeof item !== 'object' || item === null) continue;
      const { key, quote } = item as { key?: unknown; quote?: unknown };
      if ((typeof key !== 'string' && typeof key !== 'number') || !key) continue;
      if (typeof quote !== 'string' || quote === '') continue;
      pairs.push([String(key), quote]);
    }
    return pairs;
  }

  /**
   * Collects `[fieldKey, quote]` pairs from a flat-format module. The quote is
   * taken from a nested `{ value, quote }` object when the field value has a
   * `quote` property, otherwise from the sibling `<field>_quote` key.
   */
  private collectFlatQuotes(fields: Record<string, unknown>): Array<[string, string]> {
    const pairs: Array<[string, string]> = [];
    for (const [key, value] of Object.entries(fields)) {
      // "<field>_quote" entries are companions of another field, not fields themselves.
      if (key.endsWith('_quote')) continue;

      const isNestedQuoteHolder = typeof value === 'object' && value !== null && 'quote' in value;
      const quote = isNestedQuoteHolder
        ? (value as { quote?: unknown }).quote
        : fields[`${key}_quote`];

      if (typeof quote === 'string' && quote !== '') {
        pairs.push([key, quote]);
      }
    }
    return pairs;
  }

  /**
   * Lower-cases the text, collapses whitespace runs (including NBSP) to a
   * single space and removes every character that is not an ASCII word
   * character, whitespace, or a CJK ideograph in U+4E00–U+9FFF.
   */
  private normalize(text: string): string {
    return text
      .toLowerCase()
      .replace(/[\s\u00A0]+/g, ' ')
      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
      .trim();
  }

  /** Splits normalized text on whitespace, dropping tokens shorter than 2 characters. */
  private tokenize(text: string): string[] {
    return this.normalize(text)
      .split(/\s+/)
      .filter((token) => token.length >= ExtractionValidatorImpl.MIN_TOKEN_LENGTH);
  }

  /**
   * Lightweight HTML → plain text conversion (avoids adding an html-to-text
   * dependency): `<br>` and block/table tags become newlines, all other tags
   * are dropped, then character references are decoded in a single pass.
   */
  private htmlToPlainText(html: string): string {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      // Single pass so that "&amp;lt;" becomes the literal text "&lt;"
      // instead of being decoded twice into "<".
      .replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, (entity: string, body: string) =>
        this.decodeEntity(entity, body),
      )
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Decodes one character reference.
   *
   * @param entity The full reference as it appeared, e.g. `&amp;` or `&#177;`.
   * @param body The part between `&` and `;`.
   * @returns The decoded character, or `entity` unchanged when the reference
   *   is unknown or names an invalid code point.
   */
  private decodeEntity(entity: string, body: string): string {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    }
    return ExtractionValidatorImpl.NAMED_ENTITIES.get(body) ?? entity;
  }
}
/** Module-level validator instance exported for use by other modules. */
export const extractionValidator = new ExtractionValidatorImpl();