""" 统计方法分析脚本 分析测试文档中的统计方法: 1. 文档中实际使用了哪些方法 2. 我们的系统能识别哪些 3. 识别出来的哪些可以验证 """ import os import sys import re from pathlib import Path from docx import Document # 添加项目路径 sys.path.insert(0, str(Path(__file__).parent)) from forensics.config import METHOD_PATTERNS, detect_methods # 测试文件目录 TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档" # ==================== 完整的统计方法列表 ==================== # 医学研究论文中常见的统计方法 ALL_KNOWN_METHODS = { # 参数检验 "t-test": { "names": ["t检验", "t-test", "student t", "独立样本t", "两样本t"], "category": "参数检验", "can_validate": True, # Week 2 实现 T检验逆向验证 "validation_note": "根据均值、标准差、样本量反推 t 值", }, "paired-t": { "names": ["配对t", "paired t", "前后对比"], "category": "参数检验", "can_validate": False, # V2.1 实现 "validation_note": "需要配对数据,MVP 不支持", }, "anova": { "names": ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"], "category": "参数检验", "can_validate": False, # V2.1 实现 "validation_note": "多组比较,复杂度高,MVP 不支持", }, # 非参数检验 "chi-square": { "names": ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"], "category": "非参数检验", "can_validate": True, # Week 2 实现卡方检验逆向验证 "validation_note": "根据频数表反推卡方值", }, "mann-whitney": { "names": ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"], "category": "非参数检验", "can_validate": False, # V2.1 实现 "validation_note": "非参数检验,需原始数据", }, "wilcoxon": { "names": ["Wilcoxon符号秩", "配对秩"], "category": "非参数检验", "can_validate": False, "validation_note": "配对非参数检验", }, "kruskal-wallis": { "names": ["Kruskal-Wallis", "H检验"], "category": "非参数检验", "can_validate": False, "validation_note": "多组非参数比较", }, # 回归分析 "logistic": { "names": ["Logistic回归", "logit", "二元回归", "多因素logistic"], "category": "回归分析", "can_validate": False, # V2.1 实现 "validation_note": "复杂模型,需原始数据", }, "linear": { "names": ["线性回归", "多元回归", "OLS"], "category": "回归分析", "can_validate": False, "validation_note": "需原始数据", }, "cox": { "names": ["Cox回归", "比例风险模型", "生存分析"], "category": "生存分析", "can_validate": False, "validation_note": "生存分析,复杂度高", }, # 生存分析 "kaplan-meier": { "names": ["Kaplan-Meier", "KM曲线", "生存曲线"], "category": "生存分析", "can_validate": False, "validation_note": "图形方法", }, "log-rank": { "names": ["Log-rank", "对数秩检验"], "category": "生存分析", "can_validate": False, "validation_note": "生存曲线比较", }, # 相关分析 "pearson": { "names": ["Pearson相关", "相关系数r", "积差相关"], "category": "相关分析", "can_validate": False, "validation_note": "需原始数据", }, "spearman": { "names": ["Spearman相关", "秩相关", "等级相关"], "category": "相关分析", "can_validate": False, "validation_note": "非参数相关", }, # 诊断分析 "roc": { "names": ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"], "category": "诊断分析", "can_validate": False, "validation_note": "诊断准确性分析", }, # 事后检验 "lsd": { "names": ["LSD检验", "最小显著差异"], "category": "事后检验", "can_validate": False, "validation_note": "ANOVA 事后比较", }, "bonferroni": { "names": ["Bonferroni", "校正"], "category": "事后检验", "can_validate": False, "validation_note": "多重比较校正", }, } # 扩展正则模式 - 用于全面检测 EXTENDED_PATTERNS = { "t-test": re.compile(r"(t[\s\-]?检验|t[\s\-]?test|student|独立样本t|两样本t|t\s*=\s*\d)", re.I), "paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I), "chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I), "anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I), "mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I), "wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I), "kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I), "logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I), "linear": re.compile(r"(线性回归|多元回归|linear regression|ols)", re.I), "cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I), "kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I), "log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I), "pearson": re.compile(r"(pearson相关|相关系数r|积差相关|r\s*=\s*0\.\d)", re.I), "spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I), "roc": re.compile(r"(roc曲线|auc|曲线下面积|受试者工作特征)", re.I), "lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I), "bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I), } def extract_full_text(file_path: Path) -> str: """提取 Word 文档全文""" doc = Document(str(file_path)) paragraphs = [p.text for p in doc.paragraphs] # 也提取表格中的文本 for table in doc.tables: for row in table.rows: for cell in row.cells: paragraphs.append(cell.text) return "\n".join(paragraphs) def detect_all_methods(text: str) -> dict: """使用扩展模式检测所有统计方法""" found = {} for method_name, pattern in EXTENDED_PATTERNS.items(): matches = pattern.findall(text) if matches: found[method_name] = list(set(matches)) # 去重 return found def analyze_single_file(file_path: Path) -> dict: """分析单个文件""" print(f"\n{'='*60}") print(f"📄 {file_path.name[:50]}...") print(f"{'='*60}") # 提取全文 full_text = extract_full_text(file_path) # 使用扩展模式检测(全面检测) all_found = detect_all_methods(full_text) # 使用系统模式检测(当前系统能力) system_found = detect_methods(full_text) print(f"\n📊 文档中使用的统计方法:") for method, matches in sorted(all_found.items()): info = ALL_KNOWN_METHODS.get(method, {}) category = info.get("category", "其他") can_validate = info.get("can_validate", False) # 检查系统是否能识别 in_system = method in system_found or method in ["paired-t", "logistic", "cox", "mann-whitney"] status = "✅ 可验证" if can_validate else "⚠️ 仅识别" detected = "🔍 已识别" if in_system else "❌ 未识别" print(f" {method}: {matches[0][:30]}") print(f" 类别: {category} | {detected} | {status}") return { "file": file_path.name, "all_methods": list(all_found.keys()), "system_detected": system_found, "full_text_length": len(full_text), } def main(): """主分析函数""" print("=" * 70) print("🔬 RVW V2.0 统计方法分析") print("=" * 70) # 获取所有测试文件 docx_files = list(TEST_DOCS_DIR.glob("*.docx")) if not docx_files: print(f"❌ 未找到测试文件") return print(f"\n📁 测试目录: {TEST_DOCS_DIR}") print(f"📄 找到 {len(docx_files)} 个测试文件\n") # 分析每个文件 all_methods_found = set() system_detected_all = set() results = [] for file_path in docx_files: try: result = analyze_single_file(file_path) results.append(result) all_methods_found.update(result["all_methods"]) system_detected_all.update(result["system_detected"]) except Exception as e: print(f"❌ 分析失败: {e}") # 汇总报告 print("\n" + "=" * 70) print("📊 汇总分析") print("=" * 70) print(f"\n📈 统计方法覆盖情况:") print(f" 文档中共出现: {len(all_methods_found)} 种统计方法") print(f" 系统可识别: {len(system_detected_all)} 种") # 详细分类 print("\n" + "-" * 50) print("📋 详细分类:") print("-" * 50) # 分类统计 can_detect_and_validate = [] can_detect_only = [] cannot_detect = [] for method in sorted(all_methods_found): info = ALL_KNOWN_METHODS.get(method, {}) can_validate = info.get("can_validate", False) # 检查系统是否能识别 in_system = method in METHOD_PATTERNS if in_system and can_validate: can_detect_and_validate.append(method) elif in_system: can_detect_only.append(method) else: cannot_detect.append(method) print("\n✅ 【可识别 + 可验证】(MVP Week 2 实现):") for m in can_detect_and_validate: info = ALL_KNOWN_METHODS.get(m, {}) print(f" • {m}: {info.get('validation_note', '')}") print("\n⚠️ 【可识别,但无法验证】(V2.1+ 实现):") for m in can_detect_only: info = ALL_KNOWN_METHODS.get(m, {}) print(f" • {m}: {info.get('validation_note', '')}") print("\n❌ 【无法识别】(需扩展正则):") for m in cannot_detect: info = ALL_KNOWN_METHODS.get(m, {}) print(f" • {m}: {info.get('category', '其他')}") # 验证能力矩阵 print("\n" + "-" * 50) print("📋 验证能力矩阵:") print("-" * 50) print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |") print("|------|--------|--------|----------|") for method in sorted(all_methods_found): info = ALL_KNOWN_METHODS.get(method, {}) in_system = method in METHOD_PATTERNS can_validate = info.get("can_validate", False) detect_str = "✅" if in_system else "❌" validate_str = "✅" if can_validate else "❌" stage = "MVP" if can_validate else "V2.1+" print(f"| {method} | {detect_str} | {validate_str} | {stage} |") if __name__ == "__main__": main()