feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary:
- Implement L2 Statistical Validator (CI-P consistency, reverse T-test check)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support parsing of 5+ CI formats (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions

View File

@@ -0,0 +1,182 @@
"""
Data forensics module - configuration and constants.
Contains file limits, regular expressions, default settings, etc.
"""
import re
from typing import Dict, Pattern
# ==================== File limits ====================
# Maximum upload size, in megabytes and in bytes.
MAX_FILE_SIZE_MB = 20
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2

# Maximum number of rows in a single table.
MAX_TABLE_ROWS = 500
# Maximum number of tables in a single document.
MAX_TABLES_PER_DOC = 50

# The MVP only supports .docx.
ALLOWED_EXTENSIONS = {".docx"}
# ==================== Regular expressions ====================
# "n (%)" cells such as "45 (50.0%)" or "45(50%)".
# Group 1 is the number before the parentheses, group 2 the number inside.
PERCENT_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
    re.IGNORECASE
)
# P values such as "P=0.05", "p < 0.001", "P值=0.05", "P value = 0.03"
# or "P<.001". Group 1 is the numeric part.
# - The lookbehind stops the last letter of an ordinary word ("Group=3")
#   from being read as the start of a P value.
# - An optional "value"/"values" is accepted between the letter and the
#   comparison operator.
# - The number may be written without a leading zero (".001").
PVALUE_PATTERN = re.compile(
    r"(?<![A-Za-z])[Pp][\s\-值]*(?:values?[\s\-]*)?[=<>≤≥]\s*(\d+\.?\d*|\.\d+)",
    re.IGNORECASE
)
# Confidence intervals such as "95% CI: 1.2-2.5" or "(1.2, 2.5)".
# The label and the brackets are all optional, so this also matches any bare
# "number <separator> number" pair. The separator set mirrors the bracketed
# pattern in CI_PATTERNS (hyphen, en/em dash, comma, semicolon).
# NOTE(review): a leading minus sign is not captured, so "(-1.2, 3.4)" yields
# ("1.2", "3.4") - confirm whether negative bounds need support.
CI_PATTERN = re.compile(
    r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]?",
    re.IGNORECASE
)
# OR / HR / RR effect sizes such as "OR = 1.85" or "HR: 0.72".
# NOTE(review): there is no word boundary, so text like "factor: 2" also
# matches; left as-is because prefixed labels (e.g. "aOR=", "IRR=") currently
# match through the same looseness - confirm the intended behavior.
EFFECT_SIZE_PATTERN = re.compile(
    r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# ==================== Statistical method detection ====================
# Maps a method label to the regex that signals the method in free text.
# Every pattern keeps a single capturing group around its alternatives.
METHOD_PATTERNS: Dict[str, Pattern] = {
    # The lookbehind requires the "t" of "t-test" / "t检验" not to be preceded
    # by an ASCII letter, so "exact test" or "post-test" no longer count as a
    # t-test. \b is deliberately not used: CJK characters are word characters,
    # so \b would reject "采用t检验".
    "t-test": re.compile(
        r"((?<![A-Za-z])t[\s\-]?(?:test|检验)|student.*test|independent.*sample|独立样本|两样本)",
        re.IGNORECASE
    ),
    "chi-square": re.compile(
        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
        re.IGNORECASE
    ),
    "anova": re.compile(
        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
        re.IGNORECASE
    ),
    "logistic": re.compile(
        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
        re.IGNORECASE
    ),
    "cox": re.compile(
        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
        re.IGNORECASE
    ),
    "mann-whitney": re.compile(
        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
        re.IGNORECASE
    ),
    # Also accepts the "paired sample(s) t" phrasing in addition to "paired t".
    "paired-t": re.compile(
        r"(paired[\s\-]?(?:samples?[\s\-]?)?t|配对.*t|before[\s\-]?after)",
        re.IGNORECASE
    ),
}
# ==================== Table type detection ====================
# Keywords for baseline-characteristics tables.
BASELINE_KEYWORDS = [
    "baseline",
    "characteristics",
    "demographic",
    "基线",
    "特征",
    "人口学",
]

# Keywords for outcome tables.
OUTCOME_KEYWORDS = [
    "outcome",
    "result",
    "efficacy",
    "endpoint",
    "结局",
    "疗效",
    "终点",
]
# ==================== Tolerance settings (final-review recommendation) ====================
# Percentage tolerance: +/-0.1%.
DEFAULT_TOLERANCE_PERCENT = 0.1

# P-value tolerance thresholds.
# A P-value difference > 0.05 is an Error (serious contradiction).
PVALUE_ERROR_THRESHOLD = 0.05
# A P-value difference > 0.01 is a Warning (possible rounding error).
PVALUE_WARNING_THRESHOLD = 0.01
# Relative P-value error: +/-5%.
PVALUE_RELATIVE_TOLERANCE = 0.05

# CI tolerance: relative error of a CI endpoint, +/-2%.
CI_RELATIVE_TOLERANCE = 0.02

# Test-statistic tolerance: relative error of t / chi-square values, +/-5%.
STAT_RELATIVE_TOLERANCE = 0.05
# ==================== Mean±SD regular expressions ====================
# "Mean ± SD" cells such as "45.2 ± 12.3" or "45.2±12.3", plus the ASCII
# spellings "45.2 +/- 12.3" and "45.2 +- 12.3". Group 1 is the first number,
# group 2 the second. The parenthesised form "45.2 (12.3)" is NOT handled
# here; use MEAN_SD_PAREN_PATTERN for it.
# NOTE(review): a lone "+" or "-" is still accepted as the separator (as
# before), which means a plain range like "1.1-3.5" also matches - confirm
# that callers disambiguate by context.
MEAN_SD_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*(?:±|\+\s*/?\s*-|[+\-])\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# SD in parentheses, e.g. "45.2 (12.3)" - used by some tables.
MEAN_SD_PAREN_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)",  # lookahead excludes percentage formats
    re.IGNORECASE
)
# CI format normalizers (final-review recommendation: handle multiple separators).
# Each pattern captures the lower bound as group 1 and the upper bound as group 2.
CI_PATTERNS = [
    # Bracketed form: 2.5 (1.1-3.5), 2.5 [1.1-3.5] or 2.5 (1.1 to 3.5).
    re.compile(
        r"[\(\[]\s*(\d+\.?\d*)\s*(?:[-–—,;]|to)\s*(\d+\.?\d*)\s*[\)\]]",
        re.IGNORECASE,
    ),
    # Labelled form: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5.
    # The separator is either a run of punctuation or the literal word "to";
    # a character class containing "t" and "o" would also accept typos such
    # as "ot" or "ttt".
    re.compile(
        r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*(?:[-–—,;]+|to)\s*(\d+\.?\d*)",
        re.IGNORECASE,
    ),
    # Bare range: 1.1-3.5 (needs context to decide whether it is a CI).
    re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
]
# ==================== Validation functions ====================
def validate_file_size(size_bytes: int) -> tuple[bool, str]:
    """Check a file size against the configured maximum.

    Args:
        size_bytes: File size in bytes.

    Returns:
        ``(is_valid, error_message)``; the message is an empty string when
        the size does not exceed ``MAX_FILE_SIZE_BYTES``.
    """
    too_large = size_bytes > MAX_FILE_SIZE_BYTES
    if not too_large:
        return True, ""
    # The message reports the actual size in MB (one decimal) and the limit.
    return False, f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
def validate_file_extension(filename: str) -> tuple[bool, str]:
    """Check that a file name carries an allowed extension.

    The extension is compared case-insensitively against
    ``ALLOWED_EXTENSIONS``.

    Args:
        filename: Name (or path) of the file to check.

    Returns:
        ``(is_valid, error_message)``; the message is an empty string when
        the extension is allowed.
    """
    from pathlib import Path

    ext = Path(filename).suffix.lower()
    if ext in ALLOWED_EXTENSIONS:
        return True, ""
    # A rejected .doc file gets a dedicated hint to re-save it as .docx.
    if ext == ".doc":
        return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"
    return False, f"不支持的文件格式: {ext},仅支持 .docx"
def detect_methods(text: str) -> list[str]:
    """Detect statistical methods mentioned in a text (regex first).

    Args:
        text: Full text of the document.

    Returns:
        Labels from ``METHOD_PATTERNS`` whose pattern matches somewhere in
        ``text``, in the mapping's iteration order.
    """
    return [
        label
        for label, regex in METHOD_PATTERNS.items()
        if regex.search(text)
    ]