Files
AIclinicalresearch/extraction_service/forensics/config.py
HaHafeng e785969e54 feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00

183 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据侦探模块 - 配置和常量
包含文件限制、正则表达式、默认配置等。
"""
import re
from typing import Dict, Pattern
# ==================== File limits ====================
MAX_FILE_SIZE_MB = 20  # Maximum accepted file size, in MB
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # The same limit expressed in bytes
MAX_TABLE_ROWS = 500  # Maximum number of rows in a single table
MAX_TABLES_PER_DOC = 50  # Maximum number of tables in a single document
ALLOWED_EXTENSIONS = {".docx"}  # The MVP supports .docx only
# ==================== Regular expressions ====================
# Matches the "n (%)" format, e.g. "45 (50.0%)" or "45(50%)".
# Group 1 is the number before the parentheses, group 2 the number inside.
# The "%" is optional in the pattern, so "45 (50.0)" matches as well.
PERCENT_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
    re.IGNORECASE
)
# Matches a reported P value, e.g. "P=0.05", "p < 0.001", "P值=0.05" or
# "P<.001". Group 1 captures the numeric part exactly as written (it may start
# with "." and is still accepted by float()).
# - The negative lookbehind stops the "p" that ends an ordinary word from being
#   read as a P-value label (e.g. "Group=3"). It is deliberately ASCII-only so
#   that Chinese text directly in front of "P" does not block the match.
# - The second alternative accepts decimals written without a leading zero.
PVALUE_PATTERN = re.compile(
    r"(?<![A-Za-z])[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*|\.\d+)",
    re.IGNORECASE
)
# Matches a confidence interval, e.g. "95% CI: 1.2-2.5" or "(1.2, 2.5)".
# Group 1 is the first number, group 2 the second.
# The "95% CI" label and the surrounding brackets are all optional, so any two
# unsigned numbers separated by "-" or "," match (e.g. a bare "1.2-2.5").
CI_PATTERN = re.compile(
    r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-,]\s*(\d+\.?\d*)\s*[\)\]]?",
    re.IGNORECASE
)
# Matches an OR/HR/RR value, e.g. "OR=2.5" or "HR: 1.8" (case-insensitive).
# Group 1 captures the numeric value.
# NOTE(review): there is no boundary before the label, so "or"/"hr"/"rr" at the
# end of a longer word followed by "=" or ":" also matches (e.g. "factor: 1.5").
EFFECT_SIZE_PATTERN = re.compile(
    r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# ==================== Statistical method detection ====================
# Maps a method name to the compiled regex that recognises mentions of that
# method (English and Chinese keywords, case-insensitive). Each pattern is
# wrapped in one capturing group, so group(1) is the keyword that matched.
METHOD_PATTERNS: Dict[str, Pattern] = {
    # The lookbehind keeps "t test" from matching inside unrelated phrases in
    # which a word ending in "t" precedes "test" ("exact test", "post-test",
    # "significant tests"). It is ASCII-only rather than \b because \b treats
    # CJK characters as word characters and would reject "采用t检验".
    "t-test": re.compile(
        r"((?<![A-Za-z])t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
        re.IGNORECASE
    ),
    # Chi-square family; Fisher's exact test is grouped under this key.
    "chi-square": re.compile(
        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
        re.IGNORECASE
    ),
    "anova": re.compile(
        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
        re.IGNORECASE
    ),
    "logistic": re.compile(
        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
        re.IGNORECASE
    ),
    # Kaplan-Meier mentions are grouped under this key as well.
    "cox": re.compile(
        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
        re.IGNORECASE
    ),
    # Wilcoxon mentions are grouped under this key as well.
    "mann-whitney": re.compile(
        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
        re.IGNORECASE
    ),
    "paired-t": re.compile(
        r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)",
        re.IGNORECASE
    ),
}
# ==================== Table type detection ====================
# Keywords for baseline-characteristics tables
BASELINE_KEYWORDS = [
    "baseline", "characteristics", "demographic", "基线", "特征", "人口学"
]
# Keywords for outcome tables
OUTCOME_KEYWORDS = [
    "outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点"
]
# ==================== Tolerance settings (final-review recommendation) ====================
DEFAULT_TOLERANCE_PERCENT = 0.1  # Percentage tolerance: ±0.1%
# P-value tolerance thresholds
PVALUE_ERROR_THRESHOLD = 0.05  # P-value difference > 0.05 → Error (serious contradiction)
PVALUE_WARNING_THRESHOLD = 0.01  # P-value difference > 0.01 → Warning (possibly a rounding error)
PVALUE_RELATIVE_TOLERANCE = 0.05  # Relative tolerance for P values: ±5%
# CI tolerance threshold
CI_RELATIVE_TOLERANCE = 0.02  # Relative tolerance for CI endpoints: ±2%
# Test-statistic tolerance
STAT_RELATIVE_TOLERANCE = 0.05  # Relative tolerance for t/χ² values: ±5%
# ==================== Mean±SD regular expressions ====================
# Matches the "mean ± SD" format: "45.2 ± 12.3", "45.2±12.3", and the ASCII
# spellings "45.2 +/- 12.3" and "45.2 +- 12.3".
# Group 1 is the mean, group 2 the SD (both unsigned, as written).
# A bare "-" or "+" between two numbers is deliberately NOT accepted: it denotes
# a range or a sum (e.g. "18-65", "45 (30-60)"), and reading it as mean ± SD
# would feed bogus values into the SD checks. The parenthesised form
# "45.2 (12.3)" is handled by MEAN_SD_PAREN_PATTERN, not by this pattern.
MEAN_SD_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*(?:±|\+\s*/?\s*-)\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# Parenthesised SD format used by some tables, e.g. "45.2 (12.3)".
# Group 1 is the number before the parentheses, group 2 the number inside.
MEAN_SD_PAREN_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)",  # Excludes the percentage format: no "%" may follow ")"
    re.IGNORECASE
)
# CI format cleaners (final-review recommendation: handle multiple separators).
# In every pattern group 1 is the first number and group 2 the second.
CI_PATTERNS = [
    # Standard format: 2.5 (1.1-3.5) or 2.5 [1.1-3.5]
    re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
    # With a CI label: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5
    # NOTE(review): "[-–—,;to]+" is a character class, so it accepts any run of
    # those characters (including a lone "t" or "o"), not only the word "to".
    re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*)", re.IGNORECASE),
    # Plain range: 1.1-3.5 (needs context to decide whether it is a CI)
    re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
]
# ==================== 验证函数 ====================
def validate_file_size(size_bytes: int) -> tuple[bool, str]:
    """Check a file size against the module-wide upload limit.

    Args:
        size_bytes: Size of the file in bytes.

    Returns:
        ``(is_valid, error_message)``. ``error_message`` is an empty string
        when the size does not exceed ``MAX_FILE_SIZE_BYTES``; otherwise it
        reports the actual size and the limit, both in MB.
    """
    too_large = size_bytes > MAX_FILE_SIZE_BYTES
    if not too_large:
        return True, ""

    # Report the size in MB with one decimal place, matching the unit of the limit.
    size_mb = size_bytes / 1024 / 1024
    return False, f"文件大小 ({size_mb:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
def validate_file_extension(filename: str) -> tuple[bool, str]:
    """Check that a file name carries an allowed extension.

    The comparison is case-insensitive and uses only the final suffix of the
    name.

    Args:
        filename: File name or path to check.

    Returns:
        ``(is_valid, error_message)``. ``error_message`` is an empty string
        when the suffix is in ``ALLOWED_EXTENSIONS``. Legacy ``.doc`` files get
        a dedicated message asking the user to re-save as ``.docx``.
    """
    from pathlib import Path

    suffix = Path(filename).suffix.lower()
    if suffix in ALLOWED_EXTENSIONS:
        return True, ""

    if suffix == ".doc":
        # Legacy Word format: tell the user how to convert rather than just rejecting.
        message = "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"
    else:
        message = f"不支持的文件格式: {suffix},仅支持 .docx"
    return False, message
def detect_methods(text: str) -> list[str]:
    """Detect which statistical methods are mentioned in a text (regex-based).

    Args:
        text: Full text of the document.

    Returns:
        The keys of ``METHOD_PATTERNS`` whose pattern is found anywhere in
        ``text``, in the dictionary's iteration order. Empty when nothing
        matches.
    """
    return [
        name
        for name, regex in METHOD_PATTERNS.items()
        if regex.search(text)
    ]