Files
AIclinicalresearch/extraction_service/analyze_methods.py
HaHafeng e785969e54 feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00

329 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
统计方法分析脚本
分析测试文档中的统计方法:
1. 文档中实际使用了哪些方法
2. 我们的系统能识别哪些
3. 识别出来的哪些可以验证
"""
import os
import sys
import re
from pathlib import Path
from docx import Document
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.config import METHOD_PATTERNS, detect_methods
# 测试文件目录
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
# ==================== Catalog of known statistical methods ====================
# Statistical methods commonly seen in medical research papers.
# Each entry: aliases used for matching, a category label, whether our system
# can reverse-validate the reported statistic, and a short note explaining why.
ALL_KNOWN_METHODS = {
    # Parametric tests
    "t-test": {
        "names": ["t检验", "t-test", "student t", "独立样本t", "两样本t"],
        "category": "参数检验",
        "can_validate": True,  # Week 2: reverse t-test validation implemented
        "validation_note": "根据均值、标准差、样本量反推 t 值",
    },
    "paired-t": {
        "names": ["配对t", "paired t", "前后对比"],
        "category": "参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "需要配对数据MVP 不支持",
    },
    "anova": {
        "names": ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"],
        "category": "参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "多组比较复杂度高MVP 不支持",
    },
    # Non-parametric tests
    "chi-square": {
        "names": ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"],
        "category": "非参数检验",
        "can_validate": True,  # Week 2: reverse chi-square validation implemented
        "validation_note": "根据频数表反推卡方值",
    },
    "mann-whitney": {
        "names": ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"],
        "category": "非参数检验",
        "can_validate": False,  # planned for V2.1
        "validation_note": "非参数检验,需原始数据",
    },
    "wilcoxon": {
        "names": ["Wilcoxon符号秩", "配对秩"],
        "category": "非参数检验",
        "can_validate": False,
        "validation_note": "配对非参数检验",
    },
    "kruskal-wallis": {
        "names": ["Kruskal-Wallis", "H检验"],
        "category": "非参数检验",
        "can_validate": False,
        "validation_note": "多组非参数比较",
    },
    # Regression analysis
    "logistic": {
        "names": ["Logistic回归", "logit", "二元回归", "多因素logistic"],
        "category": "回归分析",
        "can_validate": False,  # planned for V2.1
        "validation_note": "复杂模型,需原始数据",
    },
    "linear": {
        "names": ["线性回归", "多元回归", "OLS"],
        "category": "回归分析",
        "can_validate": False,
        "validation_note": "需原始数据",
    },
    "cox": {
        "names": ["Cox回归", "比例风险模型", "生存分析"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "生存分析,复杂度高",
    },
    # Survival analysis
    "kaplan-meier": {
        "names": ["Kaplan-Meier", "KM曲线", "生存曲线"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "图形方法",
    },
    "log-rank": {
        "names": ["Log-rank", "对数秩检验"],
        "category": "生存分析",
        "can_validate": False,
        "validation_note": "生存曲线比较",
    },
    # Correlation analysis
    "pearson": {
        "names": ["Pearson相关", "相关系数r", "积差相关"],
        "category": "相关分析",
        "can_validate": False,
        "validation_note": "需原始数据",
    },
    "spearman": {
        "names": ["Spearman相关", "秩相关", "等级相关"],
        "category": "相关分析",
        "can_validate": False,
        "validation_note": "非参数相关",
    },
    # Diagnostic analysis
    "roc": {
        "names": ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"],
        "category": "诊断分析",
        "can_validate": False,
        "validation_note": "诊断准确性分析",
    },
    # Post-hoc tests
    "lsd": {
        "names": ["LSD检验", "最小显著差异"],
        "category": "事后检验",
        "can_validate": False,
        "validation_note": "ANOVA 事后比较",
    },
    "bonferroni": {
        "names": ["Bonferroni", "校正"],
        "category": "事后检验",
        "can_validate": False,
        "validation_note": "多重比较校正",
    },
}
# Extended regex patterns — used for exhaustive detection, independent of the
# production METHOD_PATTERNS. Each pattern has one capturing group so that
# findall() returns the matched alias text; all are case-insensitive.
EXTENDED_PATTERNS = {
    "t-test": re.compile(r"(t[\s\-]?检验|t[\s\-]?test|student|独立样本t|两样本t|t\s*=\s*\d)", re.I),
    "paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I),
    "chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I),
    "anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I),
    "mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I),
    "wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I),
    "kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I),
    "logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I),
    "linear": re.compile(r"(线性回归|多元回归|linear regression|ols)", re.I),
    "cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I),
    "kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I),
    "log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I),
    "pearson": re.compile(r"(pearson相关|相关系数r|积差相关|r\s*=\s*0\.\d)", re.I),
    "spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I),
    "roc": re.compile(r"(roc曲线|auc|曲线下面积|受试者工作特征)", re.I),
    "lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I),
    "bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I),
}
def extract_full_text(file_path: Path) -> str:
    """Return all text of a Word document: every paragraph, then every table cell.

    Table text is not included in ``doc.paragraphs``, so the cells are walked
    explicitly and appended after the body paragraphs.
    """
    document = Document(str(file_path))
    chunks = [paragraph.text for paragraph in document.paragraphs]
    chunks.extend(
        cell.text
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    return "\n".join(chunks)
def detect_all_methods(text: str) -> dict:
    """Run every extended pattern over *text*.

    Returns a dict mapping method key -> list of unique matched alias strings;
    methods with no match are omitted.
    """
    hits = {}
    for method_key, pattern in EXTENDED_PATTERNS.items():
        unique_matches = set(pattern.findall(text))
        if unique_matches:
            # De-duplicated; order is not significant to callers.
            hits[method_key] = list(unique_matches)
    return hits
def analyze_single_file(file_path: Path) -> dict:
    """Analyze one document and print a per-method report.

    Extracts the document's full text, detects statistical methods with both
    the exhaustive EXTENDED_PATTERNS and the production detector, and prints
    one line per method found.

    Returns a summary dict: file name, all methods found, the methods the
    production system detected, and the extracted text length.
    """
    print(f"\n{'='*60}")
    print(f"📄 {file_path.name[:50]}...")
    print(f"{'='*60}")

    # Full text: paragraphs plus table cells.
    full_text = extract_full_text(file_path)

    # Exhaustive detection vs. current system capability.
    all_found = detect_all_methods(full_text)
    system_found = detect_methods(full_text)

    print(f"\n📊 文档中使用的统计方法:")
    for method, matches in sorted(all_found.items()):
        info = ALL_KNOWN_METHODS.get(method, {})
        category = info.get("category", "其他")
        can_validate = info.get("can_validate", False)
        # FIX: this previously OR-ed in a hard-coded list
        # ["paired-t", "logistic", "cox", "mann-whitney"], which disagreed
        # with the `method in METHOD_PATTERNS` capability check used by the
        # summary in main(). Use the same test in both places.
        in_system = method in system_found or method in METHOD_PATTERNS
        status = "✅ 可验证" if can_validate else "⚠️ 仅识别"
        detected = "🔍 已识别" if in_system else "❌ 未识别"
        print(f" {method}: {matches[0][:30]}")
        print(f" 类别: {category} | {detected} | {status}")

    return {
        "file": file_path.name,
        "all_methods": list(all_found.keys()),
        "system_detected": system_found,
        "full_text_length": len(full_text),
    }
def main():
    """Entry point: analyze every .docx test fixture and print a coverage report.

    Sections printed:
    1. Per-file method detection (delegated to analyze_single_file).
    2. Aggregate coverage counts.
    3. Methods bucketed by recognize/validate capability.
    4. A markdown capability matrix.
    """
    print("=" * 70)
    print("🔬 RVW V2.0 统计方法分析")
    print("=" * 70)

    docx_files = list(TEST_DOCS_DIR.glob("*.docx"))
    if not docx_files:
        print(f"❌ 未找到测试文件")
        return

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件\n")

    # Analyze each file; accumulate the union of methods seen.
    all_methods_found = set()
    system_detected_all = set()
    results = []
    for file_path in docx_files:
        try:
            result = analyze_single_file(file_path)
        except Exception as e:
            # Best-effort: a single unreadable document must not stop the run.
            print(f"❌ 分析失败: {e}")
            continue
        results.append(result)
        all_methods_found.update(result["all_methods"])
        system_detected_all.update(result["system_detected"])

    # Aggregate report.
    print("\n" + "=" * 70)
    print("📊 汇总分析")
    print("=" * 70)
    print(f"\n📈 统计方法覆盖情况:")
    print(f" 文档中共出现: {len(all_methods_found)} 种统计方法")
    print(f" 系统可识别: {len(system_detected_all)}")

    print("\n" + "-" * 50)
    print("📋 详细分类:")
    print("-" * 50)

    # Bucket each method by (recognizable, validatable).
    can_detect_and_validate = []
    can_detect_only = []
    cannot_detect = []
    for method in sorted(all_methods_found):
        info = ALL_KNOWN_METHODS.get(method, {})
        can_validate = info.get("can_validate", False)
        if method not in METHOD_PATTERNS:
            cannot_detect.append(method)
        elif can_validate:
            can_detect_and_validate.append(method)
        else:
            can_detect_only.append(method)

    print("\n✅ 【可识别 + 可验证】MVP Week 2 实现):")
    for m in can_detect_and_validate:
        info = ALL_KNOWN_METHODS.get(m, {})
        print(f"{m}: {info.get('validation_note', '')}")
    print("\n⚠️ 【可识别但无法验证】V2.1+ 实现):")
    for m in can_detect_only:
        info = ALL_KNOWN_METHODS.get(m, {})
        print(f"{m}: {info.get('validation_note', '')}")
    print("\n❌ 【无法识别】(需扩展正则):")
    for m in cannot_detect:
        info = ALL_KNOWN_METHODS.get(m, {})
        print(f"{m}: {info.get('category', '其他')}")

    # Capability matrix.
    print("\n" + "-" * 50)
    print("📋 验证能力矩阵:")
    print("-" * 50)
    print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |")
    print("|------|--------|--------|----------|")
    for method in sorted(all_methods_found):
        info = ALL_KNOWN_METHODS.get(method, {})
        in_system = method in METHOD_PATTERNS
        can_validate = info.get("can_validate", False)
        # FIX: both ternary branches were the empty string ("" if ... else ""),
        # so the matrix columns always printed blank. Use the ✅/❌ markers
        # the per-file report already uses.
        detect_str = "✅" if in_system else "❌"
        validate_str = "✅" if can_validate else "❌"
        stage = "MVP" if can_validate else "V2.1+"
        print(f"| {method} | {detect_str} | {validate_str} | {stage} |")


if __name__ == "__main__":
    main()