Summary: - Implement L2 Statistical Validator (CI-P consistency, T-test reverse) - Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check) - Add error/warning severity classification with tolerance thresholds - Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix) - Complete Python forensics service (types, config, validator, extractor) V2.0 Development Progress (Week 2 Day 6): - Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator - Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1) Test Results: - Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test) - Real document tests: 5/5 successful, 2 reasonable WARNINGs Status: Day 6 completed, ready for Day 7 (Skills Framework) Co-authored-by: Cursor <cursoragent@cursor.com>
329 lines
11 KiB
Python
329 lines
11 KiB
Python
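"""
Statistical-method analysis script.

Analyzes the statistical methods that appear in the test documents:
1. Which methods the documents actually use
2. Which of them our system can recognize
3. Which of the recognized methods can be validated
"""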
|
||
|
||
import os
|
||
import sys
|
||
import re
|
||
from pathlib import Path
|
||
from docx import Document
|
||
|
||
# 添加项目路径
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
||
from forensics.config import METHOD_PATTERNS, detect_methods
|
||
|
||
# Directory holding the test documents: two levels above this file, then
# docs/03-业务模块/RVW-稿件审查系统/05-测试文档.
TEST_DOCS_DIR = Path(__file__).parent.parent.joinpath(
    "docs",
    "03-业务模块",
    "RVW-稿件审查系统",
    "05-测试文档",
)
|
||
|
||
|
||
# ==================== Full list of statistical methods ====================
# Statistical methods commonly seen in medical research papers.
#
# Maps a method key to a record with four fields:
#   "names"           - display names / aliases of the method
#   "category"        - category label printed in the report
#   "can_validate"    - whether the method is marked as verifiable
#   "validation_note" - short note printed next to the method
# The table is written as rows (key, names, category, can_validate, note) and
# expanded into the nested dict below; row order is the dict's key order.
ALL_KNOWN_METHODS = {
    key: {
        "names": names,
        "category": category,
        "can_validate": can_validate,
        "validation_note": validation_note,
    }
    for key, names, category, can_validate, validation_note in (
        # Parametric tests
        # Week 2: reverse validation of the t-test
        ("t-test", ["t检验", "t-test", "student t", "独立样本t", "两样本t"],
         "参数检验", True, "根据均值、标准差、样本量反推 t 值"),
        # Planned for V2.1
        ("paired-t", ["配对t", "paired t", "前后对比"],
         "参数检验", False, "需要配对数据,MVP 不支持"),
        # Planned for V2.1
        ("anova", ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"],
         "参数检验", False, "多组比较,复杂度高,MVP 不支持"),

        # Non-parametric tests
        # Week 2: reverse validation of the chi-square test
        ("chi-square", ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"],
         "非参数检验", True, "根据频数表反推卡方值"),
        # Planned for V2.1
        ("mann-whitney", ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"],
         "非参数检验", False, "非参数检验,需原始数据"),
        ("wilcoxon", ["Wilcoxon符号秩", "配对秩"],
         "非参数检验", False, "配对非参数检验"),
        ("kruskal-wallis", ["Kruskal-Wallis", "H检验"],
         "非参数检验", False, "多组非参数比较"),

        # Regression analysis
        # Planned for V2.1
        ("logistic", ["Logistic回归", "logit", "二元回归", "多因素logistic"],
         "回归分析", False, "复杂模型,需原始数据"),
        ("linear", ["线性回归", "多元回归", "OLS"],
         "回归分析", False, "需原始数据"),
        ("cox", ["Cox回归", "比例风险模型", "生存分析"],
         "生存分析", False, "生存分析,复杂度高"),

        # Survival analysis
        ("kaplan-meier", ["Kaplan-Meier", "KM曲线", "生存曲线"],
         "生存分析", False, "图形方法"),
        ("log-rank", ["Log-rank", "对数秩检验"],
         "生存分析", False, "生存曲线比较"),

        # Correlation analysis
        ("pearson", ["Pearson相关", "相关系数r", "积差相关"],
         "相关分析", False, "需原始数据"),
        ("spearman", ["Spearman相关", "秩相关", "等级相关"],
         "相关分析", False, "非参数相关"),

        # Diagnostic analysis
        ("roc", ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"],
         "诊断分析", False, "诊断准确性分析"),

        # Post-hoc tests
        ("lsd", ["LSD检验", "最小显著差异"],
         "事后检验", False, "ANOVA 事后比较"),
        ("bonferroni", ["Bonferroni", "校正"],
         "事后检验", False, "多重比较校正"),
    )
}
|
||
|
||
# Extended regex patterns used for exhaustive method detection.
#
# Every pattern is case-insensitive and has exactly ONE capturing group, so
# ``pattern.findall(text)`` yields the matched text as plain strings.
#
# Short ASCII tokens (``ols``, ``auc``, ``t test``, ``t = <digit>``,
# ``r = 0.<digit>``) are guarded with ASCII-letter lookarounds so they no
# longer fire inside unrelated words or symbols such as "controls",
# "Caucasian", "fastest test", "weight = 70" or "OR = 0.85".
# ``\b`` is deliberately not used: in Python 3 ``str`` patterns CJK characters
# count as word characters, so ``\bauc\b`` would miss text like "AUC值".
EXTENDED_PATTERNS = {
    "t-test": re.compile(
        r"(t[\s\-]?检验|(?<![a-z])t[\s\-]?test|student|独立样本t|两样本t|(?<![a-z])t\s*=\s*\d)",
        re.I,
    ),
    "paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I),
    "chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I),
    "anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I),
    "mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I),
    "wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I),
    "kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I),
    "logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I),
    "linear": re.compile(
        r"(线性回归|多元回归|linear regression|(?<![a-z])ols(?![a-z]))",
        re.I,
    ),
    "cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I),
    "kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I),
    "log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I),
    "pearson": re.compile(
        r"(pearson相关|相关系数r|积差相关|(?<![a-z])r\s*=\s*0\.\d)",
        re.I,
    ),
    "spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I),
    "roc": re.compile(
        r"(roc曲线|(?<![a-z])aucs?(?![a-z])|曲线下面积|受试者工作特征)",
        re.I,
    ),
    "lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I),
    "bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I),
}
|
||
|
||
|
||
def extract_full_text(file_path: Path) -> str:
    """Return the text of a Word document as one newline-joined string.

    The text of every body paragraph comes first, followed by the text of
    every cell of every row of every top-level table in ``doc.tables``.

    Args:
        file_path: Path of the ``.docx`` file to open.

    Returns:
        All collected text pieces joined with ``"\\n"``.
    """
    document = Document(str(file_path))

    body_text = [para.text for para in document.paragraphs]

    # Table content is appended after the body paragraphs, cell by cell.
    table_text = [
        cell.text
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    ]

    return "\n".join(body_text + table_text)
|
||
|
||
|
||
def detect_all_methods(text: str, patterns=None) -> dict:
    """Detect statistical methods in *text* using a table of regex patterns.

    Args:
        text: The text to scan.
        patterns: Optional mapping of method name to compiled regex. When
            omitted, the module-level ``EXTENDED_PATTERNS`` table is used.

    Returns:
        A dict mapping each method name whose pattern matched to the list of
        distinct ``findall`` results, in order of first occurrence. Methods
        without any match are absent from the result.
    """
    if patterns is None:
        patterns = EXTENDED_PATTERNS

    found = {}
    for method_name, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            # De-duplicate while keeping first-occurrence order. A plain
            # ``set`` would make the order (and therefore the sample shown
            # from index 0) vary from run to run.
            found[method_name] = list(dict.fromkeys(matches))
    return found
|
||
|
||
|
||
def analyze_single_file(file_path: Path) -> dict:
    """Analyze one document and print the statistical methods found in it.

    The document text is scanned twice: once with ``detect_all_methods``
    (the extended patterns) and once with ``detect_methods`` (the system's
    own detection). One two-line entry is printed per method found by the
    extended scan, in sorted key order.

    Args:
        file_path: Path of the ``.docx`` file to analyze.

    Returns:
        A dict with the keys ``"file"`` (file name), ``"all_methods"`` (keys
        found by the extended scan), ``"system_detected"`` (the value
        returned by ``detect_methods``) and ``"full_text_length"``.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"📄 {file_path.name[:50]}...")
    print(banner)

    # Extract the full text of the document.
    full_text = extract_full_text(file_path)

    # Exhaustive detection with the extended patterns.
    all_found = detect_all_methods(full_text)

    # Detection with the system's own patterns (current system capability).
    system_found = detect_methods(full_text)

    # These keys are always reported as recognized, whatever detect_methods
    # returned. NOTE(review): main() checks METHOD_PATTERNS instead; confirm
    # this hard-coded override is intended.
    always_recognized = ["paired-t", "logistic", "cox", "mann-whitney"]

    print("\n📊 文档中使用的统计方法:")
    for method in sorted(all_found):
        samples = all_found[method]
        info = ALL_KNOWN_METHODS.get(method, {})
        category = info.get("category", "其他")

        if info.get("can_validate", False):
            status = "✅ 可验证"
        else:
            status = "⚠️ 仅识别"

        if method in system_found or method in always_recognized:
            detected = "🔍 已识别"
        else:
            detected = "❌ 未识别"

        # Show only the first matched snippet, truncated to 30 characters.
        print(f" {method}: {samples[0][:30]}")
        print(f" 类别: {category} | {detected} | {status}")

    return {
        "file": file_path.name,
        "all_methods": list(all_found.keys()),
        "system_detected": system_found,
        "full_text_length": len(full_text),
    }
|
||
|
||
|
||
def main():
    """Analyze every ``*.docx`` file in ``TEST_DOCS_DIR`` and print a report.

    Each file is passed to ``analyze_single_file``; a failure on one file is
    printed and does not stop the run. Afterwards a summary is printed: the
    number of distinct methods found, a three-way classification (recognized
    and verifiable / recognized only / not recognized) and a capability
    matrix in Markdown table form. Returns early if no file is found.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 50

    print(heavy_rule)
    print("🔬 RVW V2.0 统计方法分析")
    print(heavy_rule)

    # Collect all test documents.
    docx_files = list(TEST_DOCS_DIR.glob("*.docx"))
    if not docx_files:
        print("❌ 未找到测试文件")
        return

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件\n")

    # Analyze each file, accumulating the union of methods over all files.
    all_methods_found = set()
    system_detected_all = set()
    reports = []

    for docx_path in docx_files:
        try:
            report = analyze_single_file(docx_path)
            reports.append(report)
            all_methods_found.update(report["all_methods"])
            system_detected_all.update(report["system_detected"])
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"❌ 分析失败: {e}")

    # Summary report.
    print("\n" + heavy_rule)
    print("📊 汇总分析")
    print(heavy_rule)

    print("\n📈 统计方法覆盖情况:")
    print(f" 文档中共出现: {len(all_methods_found)} 种统计方法")
    print(f" 系统可识别: {len(system_detected_all)} 种")

    # Detailed classification.
    print("\n" + light_rule)
    print("📋 详细分类:")
    print(light_rule)

    ordered_methods = sorted(all_methods_found)

    can_detect_and_validate = []
    can_detect_only = []
    cannot_detect = []
    for method in ordered_methods:
        # "Recognized" here means the key exists in the system's pattern table.
        if method not in METHOD_PATTERNS:
            cannot_detect.append(method)
        elif ALL_KNOWN_METHODS.get(method, {}).get("can_validate", False):
            can_detect_and_validate.append(method)
        else:
            can_detect_only.append(method)

    # (heading, methods, field shown after the method key, fallback text)
    sections = (
        ("\n✅ 【可识别 + 可验证】(MVP Week 2 实现):", can_detect_and_validate, "validation_note", ""),
        ("\n⚠️ 【可识别,但无法验证】(V2.1+ 实现):", can_detect_only, "validation_note", ""),
        ("\n❌ 【无法识别】(需扩展正则):", cannot_detect, "category", "其他"),
    )
    for heading, members, field, fallback in sections:
        print(heading)
        for m in members:
            detail = ALL_KNOWN_METHODS.get(m, {}).get(field, fallback)
            print(f" • {m}: {detail}")

    # Capability matrix.
    print("\n" + light_rule)
    print("📋 验证能力矩阵:")
    print(light_rule)
    print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |")
    print("|------|--------|--------|----------|")

    for method in ordered_methods:
        detect_str = "✅" if method in METHOD_PATTERNS else "❌"
        if ALL_KNOWN_METHODS.get(method, {}).get("can_validate", False):
            validate_str, stage = "✅", "MVP"
        else:
            validate_str, stage = "❌", "V2.1+"

        print(f"| {method} | {detect_str} | {validate_str} | {stage} |")
|
||
|
||
|
||
# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()
|