feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary: - Implement L2 Statistical Validator (CI-P consistency, T-test reverse) - Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check) - Add error/warning severity classification with tolerance thresholds - Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix) - Complete Python forensics service (types, config, validator, extractor) V2.0 Development Progress (Week 2 Day 6): - Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator - Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1) Test Results: - Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test) - Real document tests: 5/5 successful, 2 reasonable WARNINGs Status: Day 6 completed, ready for Day 7 (Skills Framework) Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions
--- a/extraction_service/analyze_methods.py
+++ b/extraction_service/analyze_methods.py
@@ -0,0 +1,328 @@
+"""
+统计方法分析脚本
+
+分析测试文档中的统计方法：
+1. 文档中实际使用了哪些方法
+2. 我们的系统能识别哪些
+3. 识别出来的哪些可以验证
+"""
+
+import os
+import sys
+import re
+from pathlib import Path
+from docx import Document
+
+# 添加项目路径
+sys.path.insert(0, str(Path(__file__).parent))
+
+from forensics.config import METHOD_PATTERNS, detect_methods
+
+# 测试文件目录
+TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
+
+
+# ==================== 完整的统计方法列表 ====================
+# 医学研究论文中常见的统计方法
+
+ALL_KNOWN_METHODS = {
+    # 参数检验
+    "t-test": {
+        "names": ["t检验", "t-test", "student t", "独立样本t", "两样本t"],
+        "category": "参数检验",
+        "can_validate": True,  # Week 2 实现 T检验逆向验证
+        "validation_note": "根据均值、标准差、样本量反推 t 值",
+    },
+    "paired-t": {
+        "names": ["配对t", "paired t", "前后对比"],
+        "category": "参数检验", 
+        "can_validate": False,  # V2.1 实现
+        "validation_note": "需要配对数据，MVP 不支持",
+    },
+    "anova": {
+        "names": ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"],
+        "category": "参数检验",
+        "can_validate": False,  # V2.1 实现
+        "validation_note": "多组比较，复杂度高，MVP 不支持",
+    },
+    
+    # 非参数检验
+    "chi-square": {
+        "names": ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"],
+        "category": "非参数检验",
+        "can_validate": True,  # Week 2 实现卡方检验逆向验证
+        "validation_note": "根据频数表反推卡方值",
+    },
+    "mann-whitney": {
+        "names": ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"],
+        "category": "非参数检验",
+        "can_validate": False,  # V2.1 实现
+        "validation_note": "非参数检验，需原始数据",
+    },
+    "wilcoxon": {
+        "names": ["Wilcoxon符号秩", "配对秩"],
+        "category": "非参数检验",
+        "can_validate": False,
+        "validation_note": "配对非参数检验",
+    },
+    "kruskal-wallis": {
+        "names": ["Kruskal-Wallis", "H检验"],
+        "category": "非参数检验",
+        "can_validate": False,
+        "validation_note": "多组非参数比较",
+    },
+    
+    # 回归分析
+    "logistic": {
+        "names": ["Logistic回归", "logit", "二元回归", "多因素logistic"],
+        "category": "回归分析",
+        "can_validate": False,  # V2.1 实现
+        "validation_note": "复杂模型，需原始数据",
+    },
+    "linear": {
+        "names": ["线性回归", "多元回归", "OLS"],
+        "category": "回归分析",
+        "can_validate": False,
+        "validation_note": "需原始数据",
+    },
+    "cox": {
+        "names": ["Cox回归", "比例风险模型", "生存分析"],
+        "category": "生存分析",
+        "can_validate": False,
+        "validation_note": "生存分析，复杂度高",
+    },
+    
+    # 生存分析
+    "kaplan-meier": {
+        "names": ["Kaplan-Meier", "KM曲线", "生存曲线"],
+        "category": "生存分析",
+        "can_validate": False,
+        "validation_note": "图形方法",
+    },
+    "log-rank": {
+        "names": ["Log-rank", "对数秩检验"],
+        "category": "生存分析",
+        "can_validate": False,
+        "validation_note": "生存曲线比较",
+    },
+    
+    # 相关分析
+    "pearson": {
+        "names": ["Pearson相关", "相关系数r", "积差相关"],
+        "category": "相关分析",
+        "can_validate": False,
+        "validation_note": "需原始数据",
+    },
+    "spearman": {
+        "names": ["Spearman相关", "秩相关", "等级相关"],
+        "category": "相关分析",
+        "can_validate": False,
+        "validation_note": "非参数相关",
+    },
+    
+    # 诊断分析
+    "roc": {
+        "names": ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"],
+        "category": "诊断分析",
+        "can_validate": False,
+        "validation_note": "诊断准确性分析",
+    },
+    
+    # 事后检验
+    "lsd": {
+        "names": ["LSD检验", "最小显著差异"],
+        "category": "事后检验",
+        "can_validate": False,
+        "validation_note": "ANOVA 事后比较",
+    },
+    "bonferroni": {
+        "names": ["Bonferroni", "校正"],
+        "category": "事后检验",
+        "can_validate": False,
+        "validation_note": "多重比较校正",
+    },
+}
+
+# 扩展正则模式 - 用于全面检测
+EXTENDED_PATTERNS = {
+    "t-test": re.compile(r"(t[\s\-]?检验|t[\s\-]?test|student|独立样本t|两样本t|t\s*=\s*\d)", re.I),
+    "paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I),
+    "chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I),
+    "anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I),
+    "mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I),
+    "wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I),
+    "kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I),
+    "logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I),
+    "linear": re.compile(r"(线性回归|多元回归|linear regression|ols)", re.I),
+    "cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I),
+    "kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I),
+    "log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I),
+    "pearson": re.compile(r"(pearson相关|相关系数r|积差相关|r\s*=\s*0\.\d)", re.I),
+    "spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I),
+    "roc": re.compile(r"(roc曲线|auc|曲线下面积|受试者工作特征)", re.I),
+    "lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I),
+    "bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I),
+}
+
+
+def extract_full_text(file_path: Path) -> str:
+    """提取 Word 文档全文"""
+    doc = Document(str(file_path))
+    paragraphs = [p.text for p in doc.paragraphs]
+    
+    # 也提取表格中的文本
+    for table in doc.tables:
+        for row in table.rows:
+            for cell in row.cells:
+                paragraphs.append(cell.text)
+    
+    return "\n".join(paragraphs)
+
+
+def detect_all_methods(text: str) -> dict:
+    """使用扩展模式检测所有统计方法"""
+    found = {}
+    for method_name, pattern in EXTENDED_PATTERNS.items():
+        matches = pattern.findall(text)
+        if matches:
+            found[method_name] = list(set(matches))  # 去重
+    return found
+
+
+def analyze_single_file(file_path: Path) -> dict:
+    """分析单个文件"""
+    print(f"\n{'='*60}")
+    print(f"📄 {file_path.name[:50]}...")
+    print(f"{'='*60}")
+    
+    # 提取全文
+    full_text = extract_full_text(file_path)
+    
+    # 使用扩展模式检测（全面检测）
+    all_found = detect_all_methods(full_text)
+    
+    # 使用系统模式检测（当前系统能力）
+    system_found = detect_methods(full_text)
+    
+    print(f"\n📊 文档中使用的统计方法:")
+    for method, matches in sorted(all_found.items()):
+        info = ALL_KNOWN_METHODS.get(method, {})
+        category = info.get("category", "其他")
+        can_validate = info.get("can_validate", False)
+        
+        # 检查系统是否能识别
+        in_system = method in system_found or method in ["paired-t", "logistic", "cox", "mann-whitney"]
+        
+        status = "✅ 可验证" if can_validate else "⚠️ 仅识别"
+        detected = "🔍 已识别" if in_system else "❌ 未识别"
+        
+        print(f"   {method}: {matches[0][:30]}")
+        print(f"      类别: {category} | {detected} | {status}")
+    
+    return {
+        "file": file_path.name,
+        "all_methods": list(all_found.keys()),
+        "system_detected": system_found,
+        "full_text_length": len(full_text),
+    }
+
+
+def main():
+    """主分析函数"""
+    print("=" * 70)
+    print("🔬 RVW V2.0 统计方法分析")
+    print("=" * 70)
+    
+    # 获取所有测试文件
+    docx_files = list(TEST_DOCS_DIR.glob("*.docx"))
+    
+    if not docx_files:
+        print(f"❌ 未找到测试文件")
+        return
+    
+    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
+    print(f"📄 找到 {len(docx_files)} 个测试文件\n")
+    
+    # 分析每个文件
+    all_methods_found = set()
+    system_detected_all = set()
+    results = []
+    
+    for file_path in docx_files:
+        try:
+            result = analyze_single_file(file_path)
+            results.append(result)
+            all_methods_found.update(result["all_methods"])
+            system_detected_all.update(result["system_detected"])
+        except Exception as e:
+            print(f"❌ 分析失败: {e}")
+    
+    # 汇总报告
+    print("\n" + "=" * 70)
+    print("📊 汇总分析")
+    print("=" * 70)
+    
+    print(f"\n📈 统计方法覆盖情况:")
+    print(f"   文档中共出现: {len(all_methods_found)} 种统计方法")
+    print(f"   系统可识别:   {len(system_detected_all)} 种")
+    
+    # 详细分类
+    print("\n" + "-" * 50)
+    print("📋 详细分类:")
+    print("-" * 50)
+    
+    # 分类统计
+    can_detect_and_validate = []
+    can_detect_only = []
+    cannot_detect = []
+    
+    for method in sorted(all_methods_found):
+        info = ALL_KNOWN_METHODS.get(method, {})
+        can_validate = info.get("can_validate", False)
+        
+        # 检查系统是否能识别
+        in_system = method in METHOD_PATTERNS
+        
+        if in_system and can_validate:
+            can_detect_and_validate.append(method)
+        elif in_system:
+            can_detect_only.append(method)
+        else:
+            cannot_detect.append(method)
+    
+    print("\n✅ 【可识别 + 可验证】（MVP Week 2 实现）:")
+    for m in can_detect_and_validate:
+        info = ALL_KNOWN_METHODS.get(m, {})
+        print(f"   • {m}: {info.get('validation_note', '')}")
+    
+    print("\n⚠️ 【可识别，但无法验证】（V2.1+ 实现）:")
+    for m in can_detect_only:
+        info = ALL_KNOWN_METHODS.get(m, {})
+        print(f"   • {m}: {info.get('validation_note', '')}")
+    
+    print("\n❌ 【无法识别】（需扩展正则）:")
+    for m in cannot_detect:
+        info = ALL_KNOWN_METHODS.get(m, {})
+        print(f"   • {m}: {info.get('category', '其他')}")
+    
+    # 验证能力矩阵
+    print("\n" + "-" * 50)
+    print("📋 验证能力矩阵:")
+    print("-" * 50)
+    print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |")
+    print("|------|--------|--------|----------|")
+    
+    for method in sorted(all_methods_found):
+        info = ALL_KNOWN_METHODS.get(method, {})
+        in_system = method in METHOD_PATTERNS
+        can_validate = info.get("can_validate", False)
+        
+        detect_str = "✅" if in_system else "❌"
+        validate_str = "✅" if can_validate else "❌"
+        stage = "MVP" if can_validate else "V2.1+"
+        
+        print(f"| {method} | {detect_str} | {validate_str} | {stage} |")
+
+
+if __name__ == "__main__":
+    main()