M1 Skeleton Pipeline: - Scatter-dispatch + Aggregator polling pattern (PgBoss) - PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs) - ExtractionSingleWorker with DeepSeek-V3 LLM extraction - PermanentExtractionError for non-retryable failures - Phantom Retry Guard (idempotent worker) - 3-step minimal frontend (Setup -> Progress -> Workbench) - 4 new DB tables (extraction_templates, project_templates, tasks, results) - 3 system templates seed (RCT, Cohort, QC) - M1 integration test suite M2 HITL Workbench: - MinerU VLM integration for high-fidelity table extraction - XML-isolated DynamicPromptBuilder with flat JSON output template - fuzzyQuoteMatch validator (3-tier confidence scoring) - SSE real-time logging via ExtractionEventBus - Schema-driven ExtractionDrawer (dynamic field rendering from template) - Excel wide-table export with flattenModuleData normalization - M2 integration test suite Critical Fixes (data normalization): - DynamicPromptBuilder: explicit flat key-value output format with example - ExtractionExcelExporter: handle both array and flat data formats - ExtractionDrawer: schema-driven rendering instead of hardcoded fields - ExtractionValidator: array-format quote verification support - SSE route: Fastify register encapsulation to bypass auth for EventSource - LLM JSON sanitizer: strip illegal control chars before JSON.parse Also includes: RVW stats verification spec, SSA expert config guide Tested: M1 pipeline test + M2 HITL test + manual frontend verification Co-authored-by: Cursor <cursoragent@cursor.com>
167 lines
5.2 KiB
TypeScript
167 lines
5.2 KiB
TypeScript
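/**
 * Extraction result validator — fuzzyQuoteMatch with three confidence tiers.
 *
 * For every field returned by the LLM, checks whether the quote attached to
 * that field can be matched in the source text.
 * Returns one of three confidence tiers: high / medium / low.
 *
 * Search scope = MinerU table HTML (tags stripped to plain text by the
 * built-in converter below) concatenated with the full-text Markdown.
 */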
|
||
|
||
import { logger } from '../../../../common/logging/index.js';
|
||
|
||
/**
 * Verification outcome for a single extracted field's supporting quote.
 */
interface QuoteVerificationEntry {
  /** Confidence tier assigned to the quote match. */
  confidence: 'high' | 'medium' | 'low';

  /** The quote that was checked, with surrounding whitespace trimmed. */
  quote: string;

  /** Numeric match score in the range 0–1 backing the confidence tier. */
  matchScore: number;
}
|
||
|
||
/**
 * Verification results keyed first by module name, then by field key.
 */
type QuoteVerificationResult = {
  [moduleName: string]: {
    [fieldKey: string]: QuoteVerificationEntry;
  };
};
|
||
|
||
/**
 * Validates LLM-extracted fields by checking that the quote attached to each
 * field can be located in the source text, and grades every match with one of
 * three confidence tiers (high / medium / low).
 */
class ExtractionValidatorImpl {
  /** Quotes shorter than this (after trimming) are too short to verify. */
  private static readonly MIN_QUOTE_LENGTH = 3;

  /** Minimum share of quote tokens found in the scope to reach "medium". */
  private static readonly MEDIUM_COVERAGE = 0.5;

  /** Tokens shorter than this are ignored when computing keyword coverage. */
  private static readonly MIN_TOKEN_LENGTH = 2;

  /** Named HTML entities decoded by {@link htmlToPlainText}. */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  /**
   * Builds the text that quotes are searched in: the plain text of every
   * table HTML fragment followed by the full Markdown, joined by newlines.
   *
   * @param fullMarkdown - Full-text Markdown of the document.
   * @param tableHtmls - Table HTML fragments (e.g. from MinerU).
   * @returns The concatenated search scope.
   */
  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
    const tableTexts = tableHtmls.map((html) => this.htmlToPlainText(html));
    return [...tableTexts, fullMarkdown].join('\n');
  }

  /**
   * Grades the quote of every field in `extractedData`.
   *
   * Each module value may be either an array of `{ key, value, quote }` items
   * or an object that is flat (`{ field: value, field_quote: "..." }`) or
   * nested (`{ field: { value, quote } }`). Fields without a non-empty string
   * quote are omitted; non-object module values are skipped entirely.
   *
   * @param extractedData - Extracted data keyed by module name.
   * @param searchScope - Raw text to search quotes in.
   * @returns Verification entries keyed by module name, then field key.
   */
  verifyAllQuotes(
    extractedData: Record<string, any>,
    searchScope: string,
  ): QuoteVerificationResult {
    const result: QuoteVerificationResult = {};
    // Normalize the scope once; it is reused for every quote.
    const normalizedScope = this.normalize(searchScope);

    for (const [moduleName, fields] of Object.entries(extractedData)) {
      if (typeof fields !== 'object' || fields === null) continue;

      const moduleResult: Record<string, QuoteVerificationEntry> = {};
      for (const [fieldKey, quote] of this.collectQuotes(fields)) {
        moduleResult[fieldKey] = this.fuzzyQuoteMatch(searchScope, normalizedScope, quote);
      }
      result[moduleName] = moduleResult;
    }

    return result;
  }

  /**
   * Core algorithm: grades a single quote against the search scope.
   *
   * 1. Exact substring match                           → high   (score 1.0)
   * 2. Substring match after normalization             → high   (score 0.95)
   * 3. Keyword coverage ≥ 50%                          → medium (score = coverage)
   * 4. Keyword coverage < 50%                          → low    (score = coverage)
   *
   * Quotes shorter than three characters, or quotes with nothing left after
   * normalization, are graded low with score 0.
   *
   * @param rawScope - Search scope as-is, used for the exact match.
   * @param normalizedScope - Search scope after normalization.
   * @param llmQuote - Quote returned by the LLM.
   * @returns The graded entry, carrying the trimmed quote.
   */
  fuzzyQuoteMatch(
    rawScope: string,
    normalizedScope: string,
    llmQuote: string,
  ): QuoteVerificationEntry {
    const trimmedQuote = llmQuote.trim();
    const grade = (
      confidence: QuoteVerificationEntry['confidence'],
      matchScore: number,
    ): QuoteVerificationEntry => ({ confidence, quote: trimmedQuote, matchScore });

    if (trimmedQuote.length < ExtractionValidatorImpl.MIN_QUOTE_LENGTH) {
      return grade('low', 0);
    }

    // Exact substring match.
    if (rawScope.includes(trimmedQuote)) {
      return grade('high', 1.0);
    }

    const normalizedQuote = this.normalize(trimmedQuote);
    // A quote with no comparable characters left (punctuation only, or only
    // characters the normalizer strips) must not pass: `includes('')` is
    // always true and would otherwise yield a spurious high-confidence match.
    if (normalizedQuote.length === 0) {
      return grade('low', 0);
    }

    // Normalized substring match (case, whitespace and punctuation ignored).
    if (normalizedScope.includes(normalizedQuote)) {
      return grade('high', 0.95);
    }

    // Keyword overlap: share of quote tokens that occur in the scope.
    const quoteTokens = this.tokenize(trimmedQuote);
    if (quoteTokens.length === 0) {
      return grade('low', 0);
    }

    const matchedCount = quoteTokens.filter((token) => normalizedScope.includes(token)).length;
    const coverage = matchedCount / quoteTokens.length;

    return grade(
      coverage >= ExtractionValidatorImpl.MEDIUM_COVERAGE ? 'medium' : 'low',
      coverage,
    );
  }

  /**
   * Collects `[fieldKey, quote]` pairs from one module's data, in source
   * order, keeping only quotes that are non-empty strings.
   */
  private collectQuotes(fields: object): Array<[string, string]> {
    const pairs: Array<[string, string]> = [];

    if (Array.isArray(fields)) {
      // Array format: [{ key, value, quote }]
      for (const item of fields as unknown[]) {
        if (typeof item !== 'object' || item === null) continue;
        const { key, quote } = item as { key?: unknown; quote?: unknown };
        if (!key) continue;
        if (typeof quote === 'string' && quote) {
          pairs.push([String(key), quote]);
        }
      }
      return pairs;
    }

    const flat = fields as Record<string, unknown>;
    for (const [key, value] of Object.entries(flat)) {
      // `<field>_quote` entries are companions of `<field>`, not fields.
      if (key.endsWith('_quote')) continue;

      // Nested { value, quote } object takes precedence over a sibling key.
      const isNested = typeof value === 'object' && value !== null && 'quote' in value;
      const quote = isNested ? (value as { quote?: unknown }).quote : flat[`${key}_quote`];

      if (typeof quote === 'string' && quote) {
        pairs.push([key, quote]);
      }
    }
    return pairs;
  }

  /**
   * Lower-cases the text, drops everything except ASCII word characters,
   * whitespace and CJK ideographs (U+4E00–U+9FFF), then collapses whitespace.
   *
   * Punctuation is removed before whitespace is collapsed so that
   * "a - b" and "a b" normalize to the same string.
   */
  private normalize(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** Splits normalized text on whitespace, dropping single-character tokens. */
  private tokenize(text: string): string[] {
    return this.normalize(text)
      .split(/\s+/)
      .filter((token) => token.length >= ExtractionValidatorImpl.MIN_TOKEN_LENGTH);
  }

  /**
   * Lightweight HTML → plain text conversion (avoids an html-to-text
   * dependency): line breaks for block/table tags, other tags removed,
   * common entities decoded, runs of blank lines squeezed.
   */
  private htmlToPlainText(html: string): string {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      // Single pass, so an escaped ampersand followed by "lt;" is decoded
      // once and not turned into "<" by a second replacement.
      .replace(/&(#\d+|#[xX][0-9a-fA-F]+|[a-z]+);/g, (entity: string, body: string) =>
        this.decodeEntity(entity, body),
      )
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Decodes one HTML entity. Unknown names and out-of-range numeric
   * references are returned unchanged.
   *
   * @param entity - The full entity text including `&` and `;`.
   * @param body - The text between `&` and `;`.
   */
  private decodeEntity(entity: string, body: string): string {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return entity;
      }
      // Treat a numeric non-breaking space like the named one: a plain space.
      return codePoint === 0xa0 ? ' ' : String.fromCodePoint(codePoint);
    }

    const decoded = ExtractionValidatorImpl.NAMED_ENTITIES.get(body);
    return decoded === undefined ? entity : decoded;
  }
}
|
||
|
||
/**
 * Module-level validator instance. The class itself is not exported, so this
 * instance is how other modules reach the validator.
 */
export const extractionValidator = new ExtractionValidatorImpl();
|