Files
HaHafeng f9ed0c2528 feat(rvw): Complete V2.0 Week 3 - Statistical validation extension and UX improvements
Week 3 Development Summary:

- Implement negative sign normalization (6 Unicode variants)

- Enhance T-test validation with smart sample size extraction

- Enhance SE triangle and CI-P consistency validation with subrow support

- Add precise sub-cell highlighting for P-values in multi-line cells

- Add frontend issue type Chinese translations (6 new types)

- Add file format tips for PDF/DOC uploads

Technical improvements:

- Add _clean_statistical_text() in extractor.py

- Add _safe_float() wrapper in validator.py

- Add ForensicsReport.tsx component

- Update ISSUE_TYPE_LABELS translations

Documentation:

- Add 2026-02-18 development record

- Update RVW module status (v5.1)

- Update system status (v5.2)

Status: Week 3 complete, ready for Week 4 testing
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-18 18:26:16 +08:00

189 lines
5.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据侦探模块 - 配置和常量
包含文件限制、正则表达式、默认配置等。
"""
import re
from typing import Dict, Pattern
# ==================== File limits ====================
# Maximum accepted upload size, in megabytes.
MAX_FILE_SIZE_MB = 20
# The same limit expressed in bytes (derived, so the two never drift apart).
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 ** 2
# Maximum number of rows in a single table.
MAX_TABLE_ROWS = 500
# Maximum number of tables in a single document.
MAX_TABLES_PER_DOC = 50
# The MVP only supports .docx files.
ALLOWED_EXTENSIONS = {".docx"}
# ==================== Regular expressions ====================
# "n (%)" cells such as "45 (50.0%)" or "45(50%)".
# Group 1 is the number before the parentheses, group 2 the number inside.
# The percent sign is optional, so "45 (50.0)" is accepted as well.
PERCENT_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)"      # group 1: integer or decimal count
    r"\s*\(\s*"             # opening parenthesis, optional whitespace
    r"(\d+(?:\.\d+)?)"      # group 2: integer or decimal percentage
    r"\s*%?\s*\)",          # optional "%", closing parenthesis
    flags=re.IGNORECASE,
)
# P-value matcher. Accepted spellings include:
#   "P=0.05", "p < 0.001", "P值=0.05"      (original forms)
#   "P value = 0.03", "P-value<0.01"       (explicit "value" label)
#   "p = .05"                              (no leading zero)
#   "P<0.05", "P⩽0.05"                   (full-width / slanted operators)
# Group 1 is the numeric text exactly as written (e.g. "0.05" or ".05").
# Every string matched by the previous pattern is still matched with the
# same captured group; the changes only add accepted spellings.
# NOTE(review): there is no left boundary before the "P", so a word ending
# in "p" followed by an operator (e.g. "group=5") also matches — callers
# should range-check the parsed value.
PVALUE_PATTERN = re.compile(
    r"[Pp][\s\-]*(?:value|值)?[\s\-值]*[=<>≤≥=<>⩽⩾]\s*(\d+\.?\d*|\.\d+)",
    re.IGNORECASE
)
# Confidence-interval matcher, e.g. "95% CI: 1.2-2.5" or "(1.2, 2.5)".
# Group 1 is the first number, group 2 the second.
# The "95% CI" label and both brackets are optional, so any two numbers
# separated by "-" or "," will match.
CI_PATTERN = re.compile(
    r"(?:95%?\s*CI[:\s]*)?"   # optional "95% CI" label with optional colon
    r"[\(\[]?\s*"             # optional opening "(" or "["
    r"(\d+\.?\d*)"            # group 1: first number
    r"\s*[-,]\s*"             # hyphen or comma separator
    r"(\d+\.?\d*)"            # group 2: second number
    r"\s*[\)\]]?",            # optional closing ")" or "]"
    flags=re.IGNORECASE,
)
# Effect-size matcher for OR / HR / RR followed by "=" or ":" and a number.
# Group 1 is the numeric text. Matching is case-insensitive.
# NOTE(review): there is no left boundary before the label, so text such as
# "factor: 2" also matches through its trailing "or".
EFFECT_SIZE_PATTERN = re.compile(
    r"(?:OR|HR|RR)"     # effect-size label
    r"\s*[=:]\s*"       # "=" or ":" with optional whitespace
    r"(\d+\.?\d*)",     # group 1: the value
    flags=re.IGNORECASE,
)
# Chi-square statistic matcher, e.g. "χ²=57.519", "χ2=57.519", "x2=57.519",
# "x²=57.519", or a bare "2=57.519" (the chi glyph lost during extraction).
# Group 1 is the numeric text.
#
# The bare-"2" fallback is only accepted when the "2" is not the tail of a
# longer number: without the lookbehind, "n=12: 5" or "0.2=1" would be read
# as a chi-square value.
# NOTE(review): the fallback still fires after letters (e.g. "R2=0.85");
# this is left as is because the lost glyph may itself have been extracted
# as some other character — confirm against real extractor output.
CHI_SQUARE_PATTERN = re.compile(
    r"(?:[χx][²2]|(?<![\d.])2)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# ==================== Statistical method detection ====================
# Maps a method identifier to a case-insensitive pattern that recognises
# English and Chinese mentions of that method in free text.
#
# The English "t test" alternative carries a lookbehind so the "t" must not
# be the last letter of a preceding word. Without it, "Fisher's exact test"
# ("exac" + "t test") and "posttest" were reported as a t-test.
METHOD_PATTERNS: Dict[str, Pattern] = {
    "t-test": re.compile(
        r"((?<![A-Za-z])t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
        re.IGNORECASE
    ),
    "chi-square": re.compile(
        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
        re.IGNORECASE
    ),
    "anova": re.compile(
        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
        re.IGNORECASE
    ),
    "logistic": re.compile(
        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
        re.IGNORECASE
    ),
    "cox": re.compile(
        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
        re.IGNORECASE
    ),
    "mann-whitney": re.compile(
        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
        re.IGNORECASE
    ),
    "paired-t": re.compile(
        r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)",
        re.IGNORECASE
    ),
}
# ==================== Table type detection ====================
# Keywords that indicate a baseline-characteristics table.
BASELINE_KEYWORDS = [
    "baseline",
    "characteristics",
    "demographic",
    "基线",
    "特征",
    "人口学",
]
# Keywords that indicate an outcome table.
OUTCOME_KEYWORDS = [
    "outcome",
    "result",
    "efficacy",
    "endpoint",
    "结局",
    "疗效",
    "终点",
]
# ==================== Tolerance settings (final-review recommendations) ====================
# Percentage tolerance: ±0.1%.
DEFAULT_TOLERANCE_PERCENT = 0.1

# --- P-value tolerance thresholds ---
# P-value difference > 0.05 -> Error (serious contradiction).
PVALUE_ERROR_THRESHOLD = 0.05
# P-value difference > 0.01 -> Warning (possibly a rounding error).
PVALUE_WARNING_THRESHOLD = 0.01
# P-value relative error: ±5%.
PVALUE_RELATIVE_TOLERANCE = 0.05

# --- CI tolerance threshold ---
# Relative error allowed on CI endpoints: ±2%.
CI_RELATIVE_TOLERANCE = 0.02

# --- Test-statistic tolerance ---
# Relative error allowed on t / chi-square values: ±5%.
STAT_RELATIVE_TOLERANCE = 0.05
# ==================== Mean±SD regular expressions ====================
# "Mean ± SD" cells such as "45.2 ± 12.3" or "45.2±12.3", plus the ASCII
# fallbacks "45.2 +/- 12.3" and "45.2 +- 12.3".
# Group 1 is the number before the separator, group 2 the number after it.
# A lone "+" or "-" between two numbers is still accepted, as it was before
# the ASCII fallbacks were added, so "1.1-3.5" also matches; callers need
# context to tell a range from a mean/SD pair.
# The parenthesised form "45.2 (12.3)" is NOT matched by this pattern.
MEAN_SD_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*(?:\+/?-|[±\+\-])\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# Parenthesised SD form used by some tables, e.g. "45.2 (12.3)".
# Group 1 is the number before the parentheses, group 2 the number inside.
# The trailing negative lookahead rejects a "%" after the closing
# parenthesis, which excludes percentage-style cells.
MEAN_SD_PAREN_PATTERN = re.compile(
    r"(\d+\.?\d*)"      # group 1: number before the parentheses
    r"\s*\(\s*"         # opening parenthesis
    r"(\d+\.?\d*)"      # group 2: number inside the parentheses
    r"\s*\)"            # closing parenthesis
    r"(?!\s*%)",        # must not be followed by a percent sign
    flags=re.IGNORECASE,
)
# CI format matchers (final-review recommendation: handle several separator
# styles). In every pattern, group 1 is the first number and group 2 the
# second. Accepted separators are hyphen, en dash, em dash and tilde
# (ASCII "~" or full-width "~"); the bracketed and labelled forms also
# accept "," and ";".
CI_PATTERNS = [
    # Bracketed form: 2.5 (1.1-3.5) or 2.5 [1.1-3.5]
    re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—~~,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
    # Labelled form: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5
    # "to" is matched as a whole word; previously the letters "t" and "o"
    # sat inside the separator character class, so "1.1 o 3.5" also matched.
    re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*(?:[-–—~~,;]+|to)\s*(\d+\.?\d*)", re.IGNORECASE),
    # Bare range: 1.1-3.5 (needs surrounding context to be trusted as a CI)
    re.compile(r"(\d+\.?\d*)\s*[-–—~~]\s*(\d+\.?\d*)", re.IGNORECASE),
]
# ==================== 验证函数 ====================
def validate_file_size(size_bytes: int) -> tuple[bool, str]:
    """
    Check an upload's size against the configured limit.

    Args:
        size_bytes: Size of the file in bytes.

    Returns:
        (is_valid, error_message) — the message is an empty string when the
        size does not exceed MAX_FILE_SIZE_BYTES.
    """
    too_large = size_bytes > MAX_FILE_SIZE_BYTES
    if not too_large:
        return True, ""

    # The message reports the actual size in MB with one decimal place.
    message = f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
    return False, message
def validate_file_extension(filename: str) -> tuple[bool, str]:
    """
    Check that a file name carries an allowed extension.

    The extension is compared case-insensitively against ALLOWED_EXTENSIONS.
    Legacy ".doc" files get a dedicated message telling the user to re-save
    the document as .docx.

    Args:
        filename: Name (or path) of the uploaded file.

    Returns:
        (is_valid, error_message) — the message is an empty string when the
        extension is allowed.
    """
    from pathlib import Path

    ext = Path(filename).suffix.lower()

    # Happy path first: nothing more to report for an allowed extension.
    if ext in ALLOWED_EXTENSIONS:
        return True, ""

    # Legacy Word format: give the user an actionable hint.
    if ext == ".doc":
        return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"

    return False, f"不支持的文件格式: {ext},仅支持 .docx"
def detect_methods(text: str) -> list[str]:
    """
    Detect which statistical methods are mentioned in a text (regex-based).

    Args:
        text: Full text of the document.

    Returns:
        Names of the methods whose pattern in METHOD_PATTERNS matches the
        text, in METHOD_PATTERNS iteration order. Empty when none match.
    """
    return [
        name
        for name, matcher in METHOD_PATTERNS.items()
        if matcher.search(text)
    ]