feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary:
- Implement L2 Statistical Validator (CI-P consistency, reverse T-test verification)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support parsing of 5+ CI formats (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions

View File

@@ -0,0 +1,328 @@
"""
统计方法分析脚本
分析测试文档中的统计方法:
1. 文档中实际使用了哪些方法
2. 我们的系统能识别哪些
3. 识别出来的哪些可以验证
"""
import os
import sys
import re
from pathlib import Path
from docx import Document
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.config import METHOD_PATTERNS, detect_methods
# Directory containing the test documents; main() globs it for *.docx files.
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
# ==================== Full list of statistical methods ====================
# Statistical methods commonly seen in medical research papers.
#
# Maps a method key (the same keys used by EXTENDED_PATTERNS) to a metadata
# dict with these entries:
#   "names"           - list of aliases for the method (not read by the code
#                       in this script; kept as reference data)
#   "category"        - display category printed in the reports
#   "can_validate"    - whether the method is marked as verifiable
#   "validation_note" - short note printed next to the method in the summary
ALL_KNOWN_METHODS = {
    # Parametric tests
    "t-test": {
        "names": ["t检验", "t-test", "student t", "独立样本t", "两样本t"],
        "category": "参数检验",
        "can_validate": True,  # reverse T-test verification implemented in Week 2
        "validation_note": "根据均值、标准差、样本量反推 t 值",
    },
    "paired-t": {
        "names": ["配对t", "paired t", "前后对比"],
        "category": "参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "需要配对数据MVP 不支持",
    },
    "anova": {
        "names": ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"],
        "category": "参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "多组比较复杂度高MVP 不支持",
    },
    # Non-parametric tests
    "chi-square": {
        "names": ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"],
        "category": "非参数检验",
        "can_validate": True,  # reverse chi-square verification implemented in Week 2
        "validation_note": "根据频数表反推卡方值",
    },
    "mann-whitney": {
        "names": ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"],
        "category": "非参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "非参数检验,需原始数据",
    },
    "wilcoxon": {
        "names": ["Wilcoxon符号秩", "配对秩"],
        "category": "非参数检验",
        "can_validate": False,
        "validation_note": "配对非参数检验",
    },
    "kruskal-wallis": {
        "names": ["Kruskal-Wallis", "H检验"],
        "category": "非参数检验",
        "can_validate": False,
        "validation_note": "多组非参数比较",
    },
    # Regression analysis
    "logistic": {
        "names": ["Logistic回归", "logit", "二元回归", "多因素logistic"],
        "category": "回归分析",
        "can_validate": False,  # planned for V2.1
        "validation_note": "复杂模型,需原始数据",
    },
    "linear": {
        "names": ["线性回归", "多元回归", "OLS"],
        "category": "回归分析",
        "can_validate": False,
        "validation_note": "需原始数据",
    },
    # Listed with the regression entries, but its category is survival analysis.
    "cox": {
        "names": ["Cox回归", "比例风险模型", "生存分析"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "生存分析,复杂度高",
    },
    # Survival analysis
    "kaplan-meier": {
        "names": ["Kaplan-Meier", "KM曲线", "生存曲线"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "图形方法",
    },
    "log-rank": {
        "names": ["Log-rank", "对数秩检验"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "生存曲线比较",
    },
    # Correlation analysis
    "pearson": {
        "names": ["Pearson相关", "相关系数r", "积差相关"],
        "category": "相关分析",
        "can_validate": False,
        "validation_note": "需原始数据",
    },
    "spearman": {
        "names": ["Spearman相关", "秩相关", "等级相关"],
        "category": "相关分析",
        "can_validate": False,
        "validation_note": "非参数相关",
    },
    # Diagnostic analysis
    "roc": {
        "names": ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"],
        "category": "诊断分析",
        "can_validate": False,
        "validation_note": "诊断准确性分析",
    },
    # Post-hoc tests
    "lsd": {
        "names": ["LSD检验", "最小显著差异"],
        "category": "事后检验",
        "can_validate": False,
        "validation_note": "ANOVA 事后比较",
    },
    "bonferroni": {
        "names": ["Bonferroni", "校正"],
        "category": "事后检验",
        "can_validate": False,
        "validation_note": "多重比较校正",
    },
}
# Extended regex patterns used for broad detection of statistical methods.
#
# Conventions:
# * Every pattern has exactly ONE capturing group, so ``findall`` returns a
#   flat list of matched strings (callers slice the first match as a string).
# * Short ASCII tokens ("auc", "ols", "t=", "r=", "t test") are guarded with
#   ASCII-letter lookarounds so they no longer fire inside unrelated words or
#   abbreviations, e.g. "Caucasian", "protocols", "first test", "HR = 0.72",
#   "OR=0.85". ``\b`` is deliberately not used: in Unicode mode CJK characters
#   are word characters, so "采用OLS回归" would have no word boundary and
#   would stop matching.
EXTENDED_PATTERNS = {
    "t-test": re.compile(
        r"(t[\s\-]?检验|(?<![a-z])t[\s\-]?test|student|独立样本t|两样本t|(?<![a-z])t\s*=\s*\d)",
        re.I,
    ),
    "paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I),
    "chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I),
    "anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I),
    "mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I),
    "wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I),
    "kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I),
    "logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I),
    "linear": re.compile(r"(线性回归|多元回归|linear regression|(?<![a-z])ols(?![a-z]))", re.I),
    "cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I),
    "kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I),
    "log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I),
    "pearson": re.compile(r"(pearson相关|相关系数r|积差相关|(?<![a-z])r\s*=\s*0\.\d)", re.I),
    "spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I),
    "roc": re.compile(r"(roc曲线|(?<![a-z])aucs?(?![a-z])|曲线下面积|受试者工作特征)", re.I),
    "lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I),
    "bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I),
}
def extract_full_text(file_path: Path) -> str:
    """Return the text of a Word document as one newline-joined string.

    The text of every body paragraph comes first, followed by the text of
    every table cell (tables in order, then rows, then cells).
    """
    document = Document(str(file_path))
    chunks = [para.text for para in document.paragraphs]
    # Also collect the text held in table cells.
    chunks.extend(
        cell.text
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    return "\n".join(chunks)
def detect_all_methods(text: str, patterns=None) -> dict:
    """Detect statistical methods in *text* using compiled regex patterns.

    Args:
        text: The text to scan.
        patterns: Optional mapping of method name -> compiled pattern.
            Defaults to the module-level ``EXTENDED_PATTERNS``.

    Returns:
        A dict mapping each method name whose pattern matched to the list of
        distinct matches, in order of first occurrence. Methods without any
        match are omitted.
    """
    if patterns is None:
        patterns = EXTENDED_PATTERNS
    found = {}
    for method_name, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            # De-duplicate while preserving first-occurrence order. A plain
            # set() would yield a hash-dependent order, making "the first
            # match" differ from run to run.
            found[method_name] = list(dict.fromkeys(matches))
    return found
def analyze_single_file(file_path: Path) -> dict:
    """Analyse one document and print which statistical methods it mentions.

    The document text is scanned twice: once with the extended patterns
    (``detect_all_methods``) and once with the system detector
    (``detect_methods``). A two-line report is printed per method found by
    the extended scan.

    Returns:
        A dict with the file name ("file"), the method keys found by the
        extended scan ("all_methods"), the raw result of the system detector
        ("system_detected") and the length of the extracted text
        ("full_text_length").
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"📄 {file_path.name[:50]}...")
    print(banner)

    full_text = extract_full_text(file_path)
    # Broad scan with the extended patterns.
    extended_hits = detect_all_methods(full_text)
    # Scan with the patterns the system currently ships.
    system_hits = detect_methods(full_text)

    # NOTE(review): these keys are always reported as recognised, whatever
    # detect_methods returned. main() instead checks METHOD_PATTERNS, so the
    # two reports can disagree - confirm whether this list is intentional.
    assumed_recognised = ["paired-t", "logistic", "cox", "mann-whitney"]

    print("\n📊 文档中使用的统计方法:")
    for name in sorted(extended_hits):
        samples = extended_hits[name]
        meta = ALL_KNOWN_METHODS.get(name, {})
        category = meta.get("category", "其他")

        if name in system_hits or name in assumed_recognised:
            detected = "🔍 已识别"
        else:
            detected = "❌ 未识别"

        if meta.get("can_validate", False):
            status = "✅ 可验证"
        else:
            status = "⚠️ 仅识别"

        # Show only the first match, truncated to 30 characters.
        print(f" {name}: {samples[0][:30]}")
        print(f" 类别: {category} | {detected} | {status}")

    summary = {
        "file": file_path.name,
        "all_methods": list(extended_hits),
        "system_detected": system_hits,
        "full_text_length": len(full_text),
    }
    return summary
def main():
    """Analyse every test document and print a coverage summary.

    Steps:
      1. Collect the ``*.docx`` files in ``TEST_DOCS_DIR``.
      2. Run ``analyze_single_file`` on each; a failure on one file is
         reported and does not stop the run.
      3. Print how many methods were found, a three-way classification
         (recognised + verifiable / recognised only / not recognised) and a
         Markdown capability matrix.

    A method counts as "recognised" here when its key is in
    ``METHOD_PATTERNS``, and as "verifiable" when ``ALL_KNOWN_METHODS`` marks
    it ``can_validate``.
    """
    print("=" * 70)
    print("🔬 RVW V2.0 统计方法分析")
    print("=" * 70)

    # glob() yields files in arbitrary order, so sort for reproducible output.
    # Word's "~$name.docx" owner/lock files also match *.docx but are not
    # real documents, so leave them out.
    docx_files = sorted(
        path
        for path in TEST_DOCS_DIR.glob("*.docx")
        if not path.name.startswith("~$")
    )
    if not docx_files:
        print("❌ 未找到测试文件")
        return
    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件\n")

    # Analyse each file, accumulating the union of detected methods.
    all_methods_found = set()
    system_detected_all = set()
    for file_path in docx_files:
        try:
            result = analyze_single_file(file_path)
            all_methods_found.update(result["all_methods"])
            system_detected_all.update(result["system_detected"])
        except Exception as e:
            # Best effort: report which file failed and carry on.
            print(f"❌ 分析失败 ({file_path.name}): {e}")

    # Summary report
    print("\n" + "=" * 70)
    print("📊 汇总分析")
    print("=" * 70)
    print("\n📈 统计方法覆盖情况:")
    print(f" 文档中共出现: {len(all_methods_found)} 种统计方法")
    print(f" 系统可识别: {len(system_detected_all)}")

    # Work out the capabilities of each method once; both the classification
    # and the matrix below are rendered from these rows.
    rows = []
    for method in sorted(all_methods_found):
        info = ALL_KNOWN_METHODS.get(method, {})
        in_system = method in METHOD_PATTERNS
        can_validate = info.get("can_validate", False)
        rows.append((method, info, in_system, can_validate))

    can_detect_and_validate = [r for r in rows if r[2] and r[3]]
    can_detect_only = [r for r in rows if r[2] and not r[3]]
    cannot_detect = [r for r in rows if not r[2]]

    # Detailed classification
    print("\n" + "-" * 50)
    print("📋 详细分类:")
    print("-" * 50)

    print("\n✅ 【可识别 + 可验证】(MVP Week 2 实现):")
    for method, info, _, _ in can_detect_and_validate:
        print(f"{method}: {info.get('validation_note', '')}")

    print("\n⚠️ 【可识别但无法验证】(V2.1+ 实现):")
    for method, info, _, _ in can_detect_only:
        print(f"{method}: {info.get('validation_note', '')}")

    print("\n❌ 【无法识别】(需扩展正则):")
    for method, info, _, _ in cannot_detect:
        print(f"{method}: {info.get('category', '其他')}")

    # Capability matrix (Markdown table)
    print("\n" + "-" * 50)
    print("📋 验证能力矩阵:")
    print("-" * 50)
    print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |")
    print("|------|--------|--------|----------|")
    for method, _, in_system, can_validate in rows:
        # Both branches used to be the empty string, so these columns were
        # always blank; use the markers the per-file report already uses.
        detect_str = "✅" if in_system else "❌"
        validate_str = "✅" if can_validate else "❌"
        stage = "MVP" if can_validate else "V2.1+"
        print(f"| {method} | {detect_str} | {validate_str} | {stage} |")
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()