Files
AIclinicalresearch/backend/src/modules/asl/extraction/services/ExtractionValidator.ts
HaHafeng f0736dbca1 feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench
M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- 3 system templates seed (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00

167 lines
5.2 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 提取结果验证器 — fuzzyQuoteMatch 三级置信度
*
* 对 LLM 返回的每个字段,检查其附带的 quote 是否能在原文中找到匹配。
* 返回三级置信度：high / medium / low
*
* 搜索范围 = MinerU HTML (html-to-text 剥离标签) + 全文 Markdown 拼接
*/
import { logger } from '../../../../common/logging/index.js';
/**
 * Outcome of checking one LLM-supplied quote against the source text.
 */
interface QuoteVerificationEntry {
// Confidence tier assigned by the matcher.
confidence: 'high' | 'medium' | 'low';
// The quote that was checked, with surrounding whitespace trimmed.
quote: string;
// Numeric match score: 1.0 for an exact substring hit, 0.95 for a normalized
// hit, otherwise the fraction of quote tokens found in the scope (0 when the
// quote is too short or has no usable tokens).
matchScore: number;
}
/**
 * Verification results keyed first by module name, then by field key.
 */
type QuoteVerificationResult = Record<string, Record<string, QuoteVerificationEntry>>;
/**
 * Validates LLM extraction output by checking that the quote attached to each
 * extracted field can actually be found in the source document, and grades
 * every quote into one of three confidence tiers (high / medium / low).
 */
class ExtractionValidatorImpl {
  /**
   * Named HTML character references decoded by `htmlToPlainText`.
   * Unknown names are left in the text unchanged.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  /**
   * Builds the text that quotes are searched in: the plain text of every
   * table HTML fragment, followed by the full Markdown, joined by newlines.
   *
   * @param fullMarkdown Full-document Markdown text.
   * @param tableHtmls HTML fragments (tables) to convert to plain text.
   * @returns The concatenated search scope.
   */
  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
    const parts = tableHtmls.map((html) => this.htmlToPlainText(html));
    parts.push(fullMarkdown);
    return parts.join('\n');
  }

  /**
   * Grades the quote of every field in `extractedData`.
   *
   * Each top-level key is a module. A module may use any of these layouts:
   * - array:  `[{ key, value, quote }]`
   * - nested: `{ field: { value, quote } }`
   * - flat:   `{ field: value, field_quote: "..." }`
   *
   * Fields without a non-empty string quote are omitted from the result.
   * Modules whose value is not an object or array are skipped entirely.
   *
   * @param extractedData Extraction output keyed by module name.
   * @param searchScope Raw text to search quotes in.
   * @returns Per-module, per-field verification entries.
   */
  verifyAllQuotes(
    extractedData: Record<string, any>,
    searchScope: string,
  ): QuoteVerificationResult {
    const result: QuoteVerificationResult = {};
    // Normalize the scope once; it is reused for every quote.
    const normalizedScope = this.normalize(searchScope);

    for (const [moduleName, fields] of Object.entries(extractedData)) {
      if (typeof fields !== 'object' || fields === null) continue;

      const moduleResult: Record<string, QuoteVerificationEntry> = {};
      result[moduleName] = moduleResult;

      if (Array.isArray(fields)) {
        for (const item of fields) {
          if (typeof item !== 'object' || !item || !item.key) continue;
          this.recordQuote(moduleResult, item.key, item.quote, searchScope, normalizedScope);
        }
        continue;
      }

      for (const [key, value] of Object.entries(fields)) {
        // `<field>_quote` entries are companions of `<field>`, not fields themselves.
        if (key.endsWith('_quote')) continue;

        // Nested layout: the quote lives inside the field object. When present
        // it takes precedence, and the flat `<field>_quote` sibling is not consulted.
        if (typeof value === 'object' && value !== null && 'quote' in value) {
          this.recordQuote(moduleResult, key, value.quote, searchScope, normalizedScope);
          continue;
        }

        this.recordQuote(moduleResult, key, fields[`${key}_quote`], searchScope, normalizedScope);
      }
    }

    return result;
  }

  /**
   * Core algorithm: grades a single quote against the search scope.
   *
   * 1. Quote shorter than 3 characters              -> low    (score 0)
   * 2. Exact substring of the raw scope             -> high   (score 1.0)
   * 3. Substring after normalization                -> high   (score 0.95)
   * 4. Token coverage >= 50%                        -> medium (score = coverage)
   * 5. Token coverage <  50%, or no usable tokens   -> low    (score = coverage or 0)
   *
   * @param rawScope Unmodified search text.
   * @param normalizedScope Normalized form of the search text (lower-cased,
   *   punctuation removed, whitespace collapsed).
   * @param llmQuote Quote returned by the LLM.
   * @returns The confidence tier, the trimmed quote and its match score.
   */
  fuzzyQuoteMatch(
    rawScope: string,
    normalizedScope: string,
    llmQuote: string,
  ): QuoteVerificationEntry {
    const trimmedQuote = llmQuote.trim();
    if (trimmedQuote.length < 3) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }

    // Exact substring match.
    if (rawScope.includes(trimmedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 1.0 };
    }

    const normalizedQuote = this.normalize(trimmedQuote);
    // A quote made only of punctuation normalizes to "". Every string
    // "includes" the empty string, so without this guard such a quote would
    // be graded as a high-confidence match.
    if (normalizedQuote.length === 0) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }

    // Normalized substring match (case, punctuation and whitespace runs ignored).
    if (normalizedScope.includes(normalizedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 0.95 };
    }

    // Keyword overlap: fraction of quote tokens that occur in the scope.
    const quoteTokens = this.tokenize(trimmedQuote);
    if (quoteTokens.length === 0) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }
    const matchedCount = quoteTokens.filter((token) => normalizedScope.includes(token)).length;
    const coverage = matchedCount / quoteTokens.length;
    const confidence = coverage >= 0.5 ? 'medium' : 'low';
    return { confidence, quote: trimmedQuote, matchScore: coverage };
  }

  /**
   * Grades `quote` and stores the entry under `key` in `target`; does nothing
   * when `quote` is not a non-empty string.
   */
  private recordQuote(
    target: Record<string, QuoteVerificationEntry>,
    key: string,
    quote: unknown,
    rawScope: string,
    normalizedScope: string,
  ): void {
    if (typeof quote !== 'string' || quote.length === 0) return;
    target[key] = this.fuzzyQuoteMatch(rawScope, normalizedScope, quote);
  }

  /**
   * Lower-cases the text, removes every character that is not a word
   * character, whitespace or a CJK ideograph (U+4E00-U+9FFF), then collapses
   * whitespace runs into single spaces and trims.
   *
   * Punctuation is removed before whitespace is collapsed so that
   * "age , 45" and "age, 45" normalize to the same string; collapsing first
   * would leave a double space behind the removed comma.
   */
  private normalize(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
      .replace(/[\s\u00A0]+/g, ' ')
      .trim();
  }

  /** Splits normalized text on whitespace and keeps tokens of 2+ characters. */
  private tokenize(text: string): string[] {
    return this.normalize(text)
      .split(/\s+/)
      .filter((token) => token.length >= 2);
  }

  /**
   * Minimal HTML -> plain text conversion (no html-to-text dependency).
   *
   * `<br>` and block-level/table tags become newlines, all other tags are
   * dropped, character references are decoded, and runs of three or more
   * newlines are collapsed to two.
   */
  private htmlToPlainText(html: string): string {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      // Decode in a single pass so already-decoded output is never decoded
      // again (e.g. "&amp;lt;" must become "&lt;", not "<").
      .replace(/&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z]+);/g, (entity: string, body: string) =>
        this.decodeEntity(entity, body),
      )
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Decodes one character reference.
   *
   * @param entity The full reference as written, e.g. `&amp;` or `&#177;`.
   * @param body The text between `&` and `;`.
   * @returns The decoded character, or `entity` unchanged when the name is
   *   unknown or the code point is not a valid Unicode scalar value.
   */
  private decodeEntity(entity: string, body: string): string {
    if (!body.startsWith('#')) {
      const named = ExtractionValidatorImpl.NAMED_ENTITIES.get(body);
      return named === undefined ? entity : named;
    }

    const isHex = body.charAt(1) === 'x' || body.charAt(1) === 'X';
    const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid =
      Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : entity;
  }
}
/** Validator instance created once when this module is loaded and exported for importers. */
export const extractionValidator = new ExtractionValidatorImpl();