# Week 3 Development Summary:
# - Implement negative sign normalization (6 Unicode variants)
# - Enhance T-test validation with smart sample size extraction
# - Enhance SE triangle and CI-P consistency validation with subrow support
# - Add precise sub-cell highlighting for P-values in multi-line cells
# - Add frontend issue type Chinese translations (6 new types)
# - Add file format tips for PDF/DOC uploads
#
# Technical improvements:
# - Add _clean_statistical_text() in extractor.py
# - Add _safe_float() wrapper in validator.py
# - Add ForensicsReport.tsx component
# - Update ISSUE_TYPE_LABELS translations
#
# Documentation:
# - Add 2026-02-18 development record
# - Update RVW module status (v5.1)
# - Update system status (v5.2)
#
# Status: Week 3 complete, ready for Week 4 testing
#
# Co-authored-by: Cursor <cursoragent@cursor.com>
#
# 189 lines | 5.3 KiB | Python
"""
Data Detective (forensics) module - configuration and constants.

Contains file limits, regular expressions, default settings, etc.
"""

import re
from pathlib import Path
from typing import Dict, Pattern

# ==================== File limits ====================

MAX_FILE_SIZE_MB = 20  # maximum file size, in MB
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

MAX_TABLE_ROWS = 500  # maximum number of rows in a single table
MAX_TABLES_PER_DOC = 50  # maximum number of tables in a single document

ALLOWED_EXTENSIONS = {".docx"}  # the MVP supports .docx only

# ==================== Regular expressions ====================

# "n (%)" cells such as "45 (50.0%)" or "45(50%)".
# Group 1 is the number before the parentheses, group 2 the number inside
# them. The "%" sign is optional, so "45 (50)" matches as well.
PERCENT_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
    re.IGNORECASE
)

# P-value expressions such as "P=0.05", "p < 0.001", "P值=0.05",
# "P-value = 0.03" or "p < .001" (no leading zero).
# Group 1 captures the number exactly as written, so it may start with ".".
# The optional "value" keyword and the ".ddd" alternative only add matches;
# every string the earlier pattern accepted still matches with the same group.
PVALUE_PATTERN = re.compile(
    r"[Pp][\s\-值]*(?:value[\s\-值]*)?[=<>≤≥]\s*(\d+\.?\d*|\.\d+)",
    re.IGNORECASE
)

# Confidence intervals such as "95% CI: 1.2-2.5" or "(1.2, 2.5)".
# Group 1 is the first number, group 2 the second.
# NOTE: the "95% CI" label and the brackets are all optional, so any two
# unsigned numbers separated by "-", "–" or "," match; the pattern does not
# accept a leading minus sign on either number.
CI_PATTERN = re.compile(
    r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-–,]\s*(\d+\.?\d*)\s*[\)\]]?",
    re.IGNORECASE
)

# Effect sizes labelled OR / HR / RR, e.g. "OR=2.5" or "HR: 1.8".
# Group 1 is the numeric value. Matching is case-insensitive and has no
# word-boundary anchor, so prefixed labels such as "aOR=1.5" match too.
EFFECT_SIZE_PATTERN = re.compile(
    r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)

# Chi-square statistics such as "χ²=57.519", "χ2=57.519" or "2=57.519".
# Group 1 is the statistic. A bare "2" is accepted as the label
# (presumably for text where the χ glyph was lost during extraction), and
# "x2"/"X2" is accepted as an ASCII spelling.
CHI_SQUARE_PATTERN = re.compile(
    r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)

# ==================== Statistical method detection ====================

# Maps a method label to a case-insensitive pattern of English and Chinese
# keywords that indicate the method is mentioned in free text.
METHOD_PATTERNS: Dict[str, Pattern] = {
    "t-test": re.compile(
        # The lookbehind stops "post-test" / "posttest" / "latest test" from
        # being read as "t-test" / "ttest" / "t test".
        r"((?<![a-z])t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
        re.IGNORECASE
    ),
    "chi-square": re.compile(
        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
        re.IGNORECASE
    ),
    "anova": re.compile(
        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
        re.IGNORECASE
    ),
    "logistic": re.compile(
        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
        re.IGNORECASE
    ),
    "cox": re.compile(
        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
        re.IGNORECASE
    ),
    "mann-whitney": re.compile(
        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
        re.IGNORECASE
    ),
    "paired-t": re.compile(
        # The lookbehind stops "unpaired t-test" and "impaired t..." from
        # being reported as a paired test.
        r"((?<![a-z])paired[\s\-]?t|配对.*t|before[\s\-]?after)",
        re.IGNORECASE
    ),
}

# ==================== Table type detection ====================

# Keywords for baseline-characteristics tables
BASELINE_KEYWORDS = [
    "baseline", "characteristics", "demographic", "基线", "特征", "人口学"
]

# Keywords for outcome tables
OUTCOME_KEYWORDS = [
    "outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点"
]

# ============ Tolerance settings (final-review recommendation) ============

DEFAULT_TOLERANCE_PERCENT = 0.1  # percentage tolerance: ±0.1%

# P-value tolerance thresholds
PVALUE_ERROR_THRESHOLD = 0.05  # P-value difference > 0.05 -> Error (serious contradiction)
PVALUE_WARNING_THRESHOLD = 0.01  # P-value difference > 0.01 -> Warning (possible rounding error)
PVALUE_RELATIVE_TOLERANCE = 0.05  # P-value relative error: ±5%

# CI tolerance threshold
CI_RELATIVE_TOLERANCE = 0.02  # CI endpoint relative error: ±2%

# Test-statistic tolerance
STAT_RELATIVE_TOLERANCE = 0.05  # t / chi-square value relative error: ±5%

# ==================== Mean±SD regular expressions ====================

# "Mean ± SD" written with an explicit separator, e.g. "45.2 ± 12.3",
# "45.2±12.3", or the ASCII spellings "45.2 +/- 12.3" and "45.2 +- 12.3".
# Group 1 is the first number, group 2 the second.
# A lone "+" or "-" is also accepted as the separator (kept for backward
# compatibility), so a plain range like "1.1-3.5" matches as well.
# The parenthesised form "45.2 (12.3)" is NOT handled here; use
# MEAN_SD_PAREN_PATTERN for that.
MEAN_SD_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*(?:±|\+/?-|[+\-])\s*(\d+\.?\d*)",
    re.IGNORECASE
)

# SD in parentheses, e.g. "45.2 (12.3)" - used by some tables.
# Percent cells are rejected: "45 (50%)" fails because ")" must directly
# follow the number, and the lookahead rejects a "%" after the parenthesis.
MEAN_SD_PAREN_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)",  # exclude percent formats
    re.IGNORECASE
)

# CI format cleaners (final-review recommendation: handle several separators).
# In every pattern group 1 is the first number and group 2 the second.
# The entries are listed from most to least specific.
CI_PATTERNS = [
    # Bracketed form: 2.5 (1.1-3.5) or 2.5 [1.1-3.5]
    re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
    # With a CI label: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5
    # "to" is matched as a word; inside a character class it would allow any
    # run of stray "t"/"o" characters to act as the separator.
    re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*(?:[-–—,;]+|to)\s*(\d+\.?\d*)", re.IGNORECASE),
    # Bare range: 1.1-3.5 (needs surrounding context to be interpreted)
    re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
]

# ==================== Validation functions ====================


def validate_file_size(size_bytes: int) -> tuple[bool, str]:
    """
    Check a file size against the upload limit.

    Args:
        size_bytes: File size in bytes.

    Returns:
        (is_valid, error_message). error_message is "" when the size does not
        exceed MAX_FILE_SIZE_BYTES; otherwise it is a user-facing message that
        reports the actual size and the limit in MB.
    """
    if size_bytes > MAX_FILE_SIZE_BYTES:
        return False, f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
    return True, ""

def validate_file_extension(filename: str) -> tuple[bool, str]:
    """
    Check that a file name carries an allowed extension.

    The comparison is case-insensitive: the suffix is lower-cased before it
    is looked up in ALLOWED_EXTENSIONS.

    Args:
        filename: File name or path; only its final suffix is inspected.

    Returns:
        (is_valid, error_message). error_message is "" when the extension is
        allowed; otherwise it is a user-facing explanation.
    """
    ext = Path(filename).suffix.lower()
    if ext in ALLOWED_EXTENSIONS:
        return True, ""

    if ext == ".doc":
        # Legacy Word files get a targeted hint on how to convert them.
        return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"

    # Build the supported-format list from the constant so the message cannot
    # drift from ALLOWED_EXTENSIONS (with {".docx"} the text is unchanged).
    supported = ", ".join(sorted(ALLOWED_EXTENSIONS))
    return False, f"不支持的文件格式: {ext},仅支持 {supported}"

def detect_methods(text: str) -> list[str]:
    """
    Detect which statistical methods a text mentions (regex-based).

    Args:
        text: Text to scan, typically the full document text.

    Returns:
        The labels (keys of METHOD_PATTERNS) whose pattern is found in the
        text, in METHOD_PATTERNS order. Empty when nothing matches or when
        text is empty or None.
    """
    if not text:
        # Nothing to scan; also avoids a TypeError from pattern.search(None).
        return []
    return [
        method_name
        for method_name, pattern in METHOD_PATTERNS.items()
        if pattern.search(text)
    ]