feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench
M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- 3 system templates seed (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -0,0 +1,166 @@
|
||||
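/**
 * Extraction result validator — fuzzyQuoteMatch with three confidence levels.
 *
 * For every field returned by the LLM, checks whether the quote attached to
 * that field can be found in the source text, and rates the match as
 * high / medium / low.
 *
 * Search scope = MinerU table HTML (tags stripped by a built-in regex-based
 * converter, no html-to-text dependency) concatenated with the full-text
 * Markdown.
 */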
|
||||
|
||||
import { logger } from '../../../../common/logging/index.js';
|
||||
|
||||
/**
 * Verification outcome for the quote attached to a single extracted field.
 */
interface QuoteVerificationEntry {
  /** Confidence level assigned to the quote match. */
  confidence: 'high' | 'medium' | 'low';
  /** The quote that was checked, with surrounding whitespace trimmed. */
  quote: string;
  /** Numeric score behind the confidence level (1.0 for an exact substring match). */
  matchScore: number;
}
|
||||
|
||||
/** Verification entries indexed first by module name, then by field key. */
type QuoteVerificationResult = Record<string, Record<string, QuoteVerificationEntry>>;
|
||||
|
||||
/**
 * Validates LLM-extracted fields by checking that the quote attached to each
 * field can be located in the source document text, and rates every quote
 * with a three-level confidence (high / medium / low).
 */
class ExtractionValidatorImpl {
  /** Trimmed quotes shorter than this are rated `low` without searching. */
  private static readonly MIN_QUOTE_LENGTH = 3;

  /** Score reported when the quote matches only after normalization. */
  private static readonly NORMALIZED_MATCH_SCORE = 0.95;

  /** Minimum keyword coverage for a `medium` rating; anything below is `low`. */
  private static readonly MEDIUM_COVERAGE_THRESHOLD = 0.5;

  /** Tokens shorter than this are ignored when computing keyword coverage. */
  private static readonly MIN_TOKEN_LENGTH = 2;

  /**
   * HTML entities decoded by `htmlToPlainText`, applied in order.
   * `&amp;` must stay last so that e.g. `&amp;lt;` decodes to the literal
   * text `&lt;` instead of being decoded twice into `<`.
   */
  private static readonly HTML_ENTITIES: ReadonlyArray<readonly [RegExp, string]> = [
    [/&nbsp;/g, ' '],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    [/&#39;/g, "'"],
    [/&apos;/g, "'"],
    [/&amp;/g, '&'],
  ];

  /**
   * Builds the text that quotes are searched in: the plain text of every
   * table HTML fragment followed by the full Markdown, joined by newlines.
   *
   * @param fullMarkdown Full-text Markdown of the document.
   * @param tableHtmls HTML fragments (tables) to convert to plain text.
   * @returns The concatenated search scope.
   */
  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
    const parts = tableHtmls.map((html) => this.htmlToPlainText(html));
    parts.push(fullMarkdown);
    return parts.join('\n');
  }

  /**
   * Rates the quote of every field in `extractedData`.
   *
   * Each module may use either of two layouts:
   * - array: `[{ key, value, quote }]`
   * - flat object: `{ field: value, field_quote: "..." }`, where a field value
   *   may also be a nested `{ value, quote }` object.
   *
   * Fields without a non-empty string quote are omitted from the result.
   * Modules whose value is not an object or array are skipped entirely.
   *
   * @param extractedData Extracted data keyed by module name.
   * @param searchScope Source text to look the quotes up in.
   * @returns Verification entries keyed by module name, then field key.
   */
  verifyAllQuotes(
    extractedData: Record<string, any>,
    searchScope: string,
  ): QuoteVerificationResult {
    const result: QuoteVerificationResult = {};
    // Normalize the scope once; it is reused for every quote.
    const normalizedScope = this.normalize(searchScope);

    // `?? {}` tolerates a missing payload instead of throwing in Object.entries.
    for (const [module, fields] of Object.entries(extractedData ?? {})) {
      if (typeof fields !== 'object' || fields === null) continue;

      result[module] = Array.isArray(fields)
        ? this.verifyArrayModule(fields, searchScope, normalizedScope)
        : this.verifyFlatModule(fields as Record<string, unknown>, searchScope, normalizedScope);
    }

    return result;
  }

  /**
   * Core algorithm: rates how well `llmQuote` is supported by the source text.
   *
   * 1. Exact substring of `rawScope`                      → high   (score 1.0)
   * 2. Substring of `normalizedScope` after normalization → high   (score 0.95)
   * 3. Keyword coverage ≥ 50%                             → medium (score = coverage)
   * 4. Keyword coverage < 50%                             → low    (score = coverage)
   *
   * Quotes shorter than 3 characters, or containing no usable keywords,
   * are rated low with score 0.
   *
   * @param rawScope Unmodified source text.
   * @param normalizedScope Lower-cased, punctuation-stripped form of the source text.
   * @param llmQuote Quote returned by the LLM.
   * @returns The confidence, the trimmed quote, and the match score.
   */
  fuzzyQuoteMatch(
    rawScope: string,
    normalizedScope: string,
    llmQuote: string,
  ): QuoteVerificationEntry {
    const quote = llmQuote.trim();
    if (quote.length < ExtractionValidatorImpl.MIN_QUOTE_LENGTH) {
      return { confidence: 'low', quote, matchScore: 0 };
    }

    if (rawScope.includes(quote)) {
      return { confidence: 'high', quote, matchScore: 1.0 };
    }

    // A punctuation-only quote normalizes to '' and ''-inclusion is always
    // true, so an empty normalized quote must never count as a match.
    const normalizedQuote = this.normalize(quote);
    if (normalizedQuote.length > 0 && normalizedScope.includes(normalizedQuote)) {
      return {
        confidence: 'high',
        quote,
        matchScore: ExtractionValidatorImpl.NORMALIZED_MATCH_SCORE,
      };
    }

    const quoteTokens = this.tokenize(quote);
    if (quoteTokens.length === 0) {
      return { confidence: 'low', quote, matchScore: 0 };
    }

    const matchedCount = quoteTokens.filter((token) => normalizedScope.includes(token)).length;
    const coverage = matchedCount / quoteTokens.length;
    const confidence =
      coverage >= ExtractionValidatorImpl.MEDIUM_COVERAGE_THRESHOLD ? 'medium' : 'low';

    return { confidence, quote, matchScore: coverage };
  }

  /** Verifies one module given in array layout `[{ key, value, quote }]`. */
  private verifyArrayModule(
    items: unknown[],
    searchScope: string,
    normalizedScope: string,
  ): Record<string, QuoteVerificationEntry> {
    const verified: Record<string, QuoteVerificationEntry> = {};

    for (const item of items) {
      if (typeof item !== 'object' || item === null) continue;

      const { key, quote } = item as { key?: unknown; quote?: unknown };
      if (!key) continue;
      if (typeof quote !== 'string' || quote === '') continue;

      verified[String(key)] = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote);
    }

    return verified;
  }

  /** Verifies one module given in flat layout `{ field, field_quote }`. */
  private verifyFlatModule(
    fields: Record<string, unknown>,
    searchScope: string,
    normalizedScope: string,
  ): Record<string, QuoteVerificationEntry> {
    const verified: Record<string, QuoteVerificationEntry> = {};

    for (const [key, value] of Object.entries(fields)) {
      // `*_quote` keys are companions of a field, not fields themselves.
      if (key.endsWith('_quote')) continue;

      // A nested { value, quote } object carries its own quote and takes
      // precedence over a sibling `<key>_quote` entry.
      const quote =
        typeof value === 'object' && value !== null && 'quote' in value
          ? (value as { quote?: unknown }).quote
          : fields[`${key}_quote`];
      if (typeof quote !== 'string' || quote === '') continue;

      verified[key] = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote);
    }

    return verified;
  }

  /**
   * Lower-cases the text, removes everything except word characters,
   * whitespace and CJK ideographs (U+4E00–U+9FFF), then collapses whitespace.
   *
   * Punctuation is stripped before whitespace is collapsed so that
   * "a , b" and "a, b" both normalize to "a b".
   */
  private normalize(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** Splits normalized text on whitespace, dropping tokens shorter than 2 characters. */
  private tokenize(text: string): string[] {
    return this.normalize(text)
      .split(/\s+/)
      .filter((token) => token.length >= ExtractionValidatorImpl.MIN_TOKEN_LENGTH);
  }

  /**
   * Minimal HTML → plain-text conversion (avoids an html-to-text dependency).
   *
   * Line breaks and block/table tags become newlines, remaining tags are
   * removed, common entities are decoded, and runs of three or more newlines
   * are collapsed to two.
   */
  private htmlToPlainText(html: string): string {
    const withoutTags = html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '');

    // Entities are decoded only after tag stripping so that a decoded '<'
    // is never mistaken for the start of a tag.
    const decoded = ExtractionValidatorImpl.HTML_ENTITIES.reduce(
      (text, [pattern, replacement]) => text.replace(pattern, replacement),
      withoutTags,
    );

    return decoded.replace(/\n{3,}/g, '\n\n').trim();
  }
}
|
||||
|
||||
/** Shared module-level instance of the extraction validator. */
export const extractionValidator = new ExtractionValidatorImpl();
|
||||
Reference in New Issue
Block a user