feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary: - Implement L2 Statistical Validator (CI-P consistency, T-test reverse) - Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check) - Add error/warning severity classification with tolerance thresholds - Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix) - Complete Python forensics service (types, config, validator, extractor) V2.0 Development Progress (Week 2 Day 6): - Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator - Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1) Test Results: - Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test) - Real document tests: 5/5 successful, 2 reasonable WARNINGs Status: Day 6 completed, ready for Day 7 (Skills Framework) Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions
--- a/extraction_service/forensics/config.py
+++ b/extraction_service/forensics/config.py
@@ -0,0 +1,182 @@
+"""
+数据侦探模块 - 配置和常量
+
+包含文件限制、正则表达式、默认配置等。
+"""
+
+import re
+from typing import Dict, Pattern
+
+# ==================== File limits ====================
+
+MAX_FILE_SIZE_MB = 20       # Maximum file size (MB)
+MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
+
+MAX_TABLE_ROWS = 500        # Maximum rows per table
+MAX_TABLES_PER_DOC = 50     # Maximum tables per document
+
+ALLOWED_EXTENSIONS = {".docx"}  # MVP supports .docx only
+
+
+# ==================== Regular expressions ====================
+
+# Matches the n (%) format, e.g. "45 (50.0%)" or "45(50%)"; the % sign is optional, so "45 (50.0)" matches too
+PERCENT_PATTERN = re.compile(
+    r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
+    re.IGNORECASE
+)
+
+# Matches P values, e.g. "P=0.05", "p < 0.001" or "P值=0.05"; a leading digit is required, so ".05" is not captured
+PVALUE_PATTERN = re.compile(
+    r"[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*)",
+    re.IGNORECASE
+)
+
+# Matches confidence intervals, e.g. "95% CI: 1.2-2.5" or "(1.2, 2.5)"; prefix and brackets are optional, so any two numbers separated by "-", "–" or "," match
+CI_PATTERN = re.compile(
+    r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-–,]\s*(\d+\.?\d*)\s*[\)\]]?",
+    re.IGNORECASE
+)
+
+# Matches OR/HR/RR effect sizes written with "=" or ":", e.g. "OR = 1.5" or "HR: 0.8"
+EFFECT_SIZE_PATTERN = re.compile(
+    r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
+    re.IGNORECASE
+)
+
+
+# ==================== Statistical method detection ====================
+
+METHOD_PATTERNS: Dict[str, Pattern] = {  # method name -> case-insensitive regex (English and Chinese keywords)
+    "t-test": re.compile(
+        r"(t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
+        re.IGNORECASE
+    ),
+    "chi-square": re.compile(
+        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
+        re.IGNORECASE
+    ),
+    "anova": re.compile(
+        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
+        re.IGNORECASE
+    ),
+    "logistic": re.compile(
+        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
+        re.IGNORECASE
+    ),
+    "cox": re.compile(
+        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
+        re.IGNORECASE
+    ),
+    "mann-whitney": re.compile(
+        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
+        re.IGNORECASE
+    ),
+    "paired-t": re.compile(
+        r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)",
+        re.IGNORECASE
+    ),
+}
+
+
+# ==================== Table type detection ====================
+
+# Keywords for baseline-characteristics tables (English and Chinese)
+BASELINE_KEYWORDS = [
+    "baseline", "characteristics", "demographic", "基线", "特征", "人口学"
+]
+
+# Keywords for outcome tables (English and Chinese)
+OUTCOME_KEYWORDS = [
+    "outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点"
+]
+
+
+# ==================== Tolerance settings (final-review recommendation) ====================
+
+DEFAULT_TOLERANCE_PERCENT = 0.1  # Percentage tolerance ±0.1%
+
+# P-value tolerance thresholds
+PVALUE_ERROR_THRESHOLD = 0.05    # P-value difference > 0.05 → Error (serious contradiction)
+PVALUE_WARNING_THRESHOLD = 0.01  # P-value difference > 0.01 → Warning (possible rounding error)
+PVALUE_RELATIVE_TOLERANCE = 0.05 # P-value relative error ±5%
+
+# CI tolerance threshold
+CI_RELATIVE_TOLERANCE = 0.02     # Relative error of CI endpoints ±2%
+
+# Test-statistic tolerance
+STAT_RELATIVE_TOLERANCE = 0.05   # Relative error of t/χ² values ±5%
+
+
+# ==================== Mean±SD regular expressions ====================
+
+# Mean ± SD format, e.g. "45.2 ± 12.3" or "45.2±12.3"; "+" and "-" are also accepted as separators ("45.2 (12.3)" is handled by MEAN_SD_PAREN_PATTERN)
+MEAN_SD_PATTERN = re.compile(
+    r"(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*)",
+    re.IGNORECASE
+)
+
+# SD in parentheses, e.g. "45.2 (12.3)" - used by some tables
+MEAN_SD_PAREN_PATTERN = re.compile(
+    r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)",  # lookahead rejects a % following the closing parenthesis
+    re.IGNORECASE
+)
+
+# CI format patterns (final-review recommendation: handle multiple separators)
+CI_PATTERNS = [
+    # Bracketed format: 2.5 (1.1-3.5) or 2.5 [1.1-3.5]; only the bracketed range itself is matched
+    re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
+    # With CI label: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5; NOTE(review): "[-–—,;to]+" is a character class, so any run of those characters is accepted
+    re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*)", re.IGNORECASE),
+    # Bare range: 1.1-3.5 (needs surrounding context to decide whether it is a CI)
+    re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
+]
+
+
+# ==================== 验证函数 ====================
+
+def validate_file_size(size_bytes: int) -> tuple[bool, str]:
+    """
+    Check that a file size in bytes does not exceed MAX_FILE_SIZE_BYTES.
+    
+    Returns:
+        (is_valid, error_message); error_message is "" when the size is accepted
+    """
+    if size_bytes > MAX_FILE_SIZE_BYTES:
+        return False, f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
+    return True, ""
+
+
+def validate_file_extension(filename: str) -> tuple[bool, str]:
+    """
+    Check that the lower-cased suffix of filename is in ALLOWED_EXTENSIONS.
+    
+    Returns:
+        (is_valid, error_message); ".doc" gets a dedicated "save as .docx" message
+    """
+    from pathlib import Path
+    ext = Path(filename).suffix.lower()
+    
+    if ext not in ALLOWED_EXTENSIONS:
+        if ext == ".doc":
+            return False, "暂不支持 .doc 格式，请使用 Word 另存为 .docx 格式后重新上传"
+        return False, f"不支持的文件格式: {ext}，仅支持 .docx"
+    
+    return True, ""
+
+
+def detect_methods(text: str) -> list[str]:
+    """
+    Detect statistical methods mentioned in text (regex-first approach).
+    
+    Args:
+        text: Full text of the document
+        
+    Returns:
+        Names of the METHOD_PATTERNS entries whose regex matches, in dict order
+    """
+    found = []
+    for method_name, pattern in METHOD_PATTERNS.items():
+        if pattern.search(text):
+            found.append(method_name)
+    return found