AIclinicalresearch/backend/src/modules/asl/extraction/services/ExtractionValidator.ts

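/**
 * Extraction result validator: fuzzyQuoteMatch with three confidence levels.
 *
 * For every field returned by the LLM, checks whether the accompanying quote
 * can be located in the source text, and reports one of three confidence
 * levels: high / medium / low.
 *
 * Search scope = MinerU table HTML converted to plain text (tags are stripped
 * by this module's own lightweight converter, not by the `html-to-text`
 * package) concatenated with the full-text Markdown.
 */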

import { logger } from '../../../../common/logging/index.js';

/**
 * Verification outcome for a single extracted field's quote.
 */
interface QuoteVerificationEntry {
  /** Confidence that the quote actually appears in the searched text. */
  confidence: 'high' | 'medium' | 'low';
  /** The quote that was checked, with surrounding whitespace trimmed. */
  quote: string;
  /**
   * Match score: 1.0 for an exact substring hit, 0.95 for a normalized
   * substring hit, otherwise the fraction of quote tokens found in the scope
   * (0 when the quote is too short or yields no tokens).
   */
  matchScore: number;
}

/** Verification entries keyed first by module name, then by field key. */
type QuoteVerificationResult = Record<string, Record<string, QuoteVerificationEntry>>;

/**
 * Verifies that the quotes an LLM attaches to extracted fields can actually be
 * found in the source document, and grades each one as high / medium / low.
 */
class ExtractionValidatorImpl {
  /** Quotes shorter than this (after trimming) are always graded `low`. */
  private static readonly MIN_QUOTE_LENGTH = 3;

  /** Tokens shorter than this are ignored when computing keyword coverage. */
  private static readonly MIN_TOKEN_LENGTH = 2;

  /** Minimum keyword coverage for a quote to be graded `medium`. */
  private static readonly MEDIUM_COVERAGE_THRESHOLD = 0.5;

  /**
   * Named HTML character references decoded by `htmlToPlainText`.
   * A Map is used so that names such as `constructor` cannot resolve to
   * inherited object properties.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['plusmn', '\u00b1'],
    ['le', '\u2264'],
    ['ge', '\u2265'],
    ['times', '\u00d7'],
    ['minus', '\u2212'],
    ['ndash', '\u2013'],
    ['mdash', '\u2014'],
  ]);

  /**
   * Builds the text that quotes are searched in: the plain text of every table
   * HTML fragment (in order), followed by the full Markdown, joined by newlines.
   *
   * @param fullMarkdown Full-text Markdown of the document.
   * @param tableHtmls HTML fragments (tables) to convert to plain text.
   * @returns The concatenated search scope.
   */
  buildQuoteSearchScope(fullMarkdown: string, tableHtmls: string[]): string {
    const parts = tableHtmls.map((html) => this.htmlToPlainText(html));
    parts.push(fullMarkdown);
    return parts.join('\n');
  }

  /**
   * Grades the quote of every field in `extractedData`.
   *
   * Three per-module layouts are supported:
   * - array form: `[{ key, value, quote }]`
   * - nested form: `{ field: { value, quote } }`
   * - flat form: `{ field: value, field_quote: "..." }`
   *
   * Fields without a non-empty string quote are omitted from the result.
   * Modules whose value is not an object or array are skipped entirely.
   *
   * @param extractedData Extracted data keyed by module name.
   * @param searchScope Text to search, e.g. from `buildQuoteSearchScope`.
   * @returns Verification entries keyed by module, then by field key.
   */
  verifyAllQuotes(
    extractedData: Record<string, any>,
    searchScope: string,
  ): QuoteVerificationResult {
    const result: QuoteVerificationResult = {};
    // Normalize the scope once; it is reused for every quote.
    const normalizedScope = this.normalize(searchScope);

    const verify = (quote: unknown): QuoteVerificationEntry | undefined =>
      typeof quote === 'string' && quote.length > 0
        ? this.fuzzyQuoteMatch(searchScope, normalizedScope, quote)
        : undefined;

    for (const [module, fields] of Object.entries(extractedData)) {
      if (typeof fields !== 'object' || fields === null) continue;

      const moduleResult: Record<string, QuoteVerificationEntry> = {};
      result[module] = moduleResult;

      if (Array.isArray(fields)) {
        for (const item of fields) {
          if (typeof item !== 'object' || item === null || !item.key) continue;
          const entry = verify(item.quote);
          if (entry) moduleResult[item.key] = entry;
        }
        continue;
      }

      for (const [key, value] of Object.entries(fields)) {
        // `<field>_quote` entries are companions of `<field>`, not fields themselves.
        if (key.endsWith('_quote')) continue;

        // A nested { value, quote } object takes precedence over the flat
        // `<field>_quote` companion, even when its quote is unusable.
        const isNested = typeof value === 'object' && value !== null && 'quote' in value;
        const entry = verify(isNested ? value.quote : fields[`${key}_quote`]);
        if (entry) moduleResult[key] = entry;
      }
    }

    return result;
  }

  /**
   * Core algorithm: grades one quote against the search scope.
   *
   * 1. Exact substring match in the raw scope -> high (score 1.0)
   * 2. Substring match after normalization (case, whitespace and punctuation
   *    ignored) -> high (score 0.95)
   * 3. Keyword coverage >= 50% -> medium (score = coverage)
   * 4. Keyword coverage < 50% -> low (score = coverage)
   *
   * Quotes shorter than 3 characters, and quotes that produce no tokens,
   * are graded low with score 0.
   *
   * @param rawScope The unmodified search scope.
   * @param normalizedScope The normalized form of `rawScope`.
   * @param llmQuote The quote returned by the LLM.
   * @returns The confidence grade, trimmed quote and match score.
   */
  fuzzyQuoteMatch(
    rawScope: string,
    normalizedScope: string,
    llmQuote: string,
  ): QuoteVerificationEntry {
    const trimmedQuote = llmQuote.trim();
    if (trimmedQuote.length < ExtractionValidatorImpl.MIN_QUOTE_LENGTH) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }

    // Exact substring match
    if (rawScope.includes(trimmedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 1.0 };
    }

    // Normalized substring match. An empty normalized quote (e.g. the quote
    // was punctuation only) must not count: every string includes ''.
    const normalizedQuote = this.normalize(trimmedQuote);
    if (normalizedQuote.length > 0 && normalizedScope.includes(normalizedQuote)) {
      return { confidence: 'high', quote: trimmedQuote, matchScore: 0.95 };
    }

    // Keyword overlap
    const quoteTokens = this.tokenize(trimmedQuote);
    if (quoteTokens.length === 0) {
      return { confidence: 'low', quote: trimmedQuote, matchScore: 0 };
    }

    const matchedCount = quoteTokens.filter((token) => normalizedScope.includes(token)).length;
    const coverage = matchedCount / quoteTokens.length;
    const confidence =
      coverage >= ExtractionValidatorImpl.MEDIUM_COVERAGE_THRESHOLD ? 'medium' : 'low';

    return { confidence, quote: trimmedQuote, matchScore: coverage };
  }

  /**
   * Lower-cases the text, removes every character that is not a word
   * character, whitespace or a CJK ideograph (U+4E00-U+9FFF), then collapses
   * whitespace runs to a single space and trims.
   *
   * Punctuation is removed BEFORE whitespace is collapsed so that text such as
   * "( 45" does not leave a double space behind and defeat substring matching.
   */
  private normalize(text: string): string {
    return text
      .toLowerCase()
      .replace(/[^\w\s\u4e00-\u9fff]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** Splits normalized text on whitespace, dropping tokens shorter than 2 characters. */
  private tokenize(text: string): string[] {
    return this.normalize(text)
      .split(/\s+/)
      .filter((token) => token.length >= ExtractionValidatorImpl.MIN_TOKEN_LENGTH);
  }

  /**
   * Minimal HTML -> plain text conversion (avoids an html-to-text dependency).
   *
   * `<br>` and block/table tags become newlines, all other tags are removed,
   * character references are decoded, and runs of 3+ newlines are collapsed.
   */
  private htmlToPlainText(html: string): string {
    return html
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<\/?(tr|td|th|thead|tbody|table|div|p|li|ul|ol|h[1-6])[^>]*>/gi, '\n')
      .replace(/<[^>]+>/g, '')
      // Single pass, so "&amp;lt;" yields the literal text "&lt;" rather than "<".
      .replace(
        /&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z][a-zA-Z0-9]*);/g,
        (entity: string, body: string) => this.decodeEntity(entity, body),
      )
      .replace(/\n{3,}/g, '\n\n')
      .trim();
  }

  /**
   * Decodes one character reference. Unknown names and invalid code points
   * are returned unchanged.
   *
   * @param entity The full reference text, e.g. `&lt;` or `&#177;`.
   * @param body The text between `&` and `;`.
   */
  private decodeEntity(entity: string, body: string): string {
    if (!body.startsWith('#')) {
      return ExtractionValidatorImpl.NAMED_ENTITIES.get(body) ?? entity;
    }

    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    // String.fromCodePoint throws a RangeError above U+10FFFF.
    if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
      return entity;
    }

    // Keep numeric no-break spaces consistent with `&nbsp;`, which maps to a plain space.
    return codePoint === 0xa0 ? ' ' : String.fromCodePoint(codePoint);
  }
}

/** Module-level validator instance exported for use by importing modules. */
export const extractionValidator = new ExtractionValidatorImpl();