""" 数据侦探模块 - 配置和常量 包含文件限制、正则表达式、默认配置等。 """ import re from typing import Dict, Pattern # ==================== 文件限制 ==================== MAX_FILE_SIZE_MB = 20 # 最大文件大小(MB) MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 MAX_TABLE_ROWS = 500 # 单表最大行数 MAX_TABLES_PER_DOC = 50 # 单文档最大表格数 ALLOWED_EXTENSIONS = {".docx"} # MVP 仅支持 .docx # ==================== 正则表达式 ==================== # n (%) 格式匹配,如 "45 (50.0%)" 或 "45(50%)" PERCENT_PATTERN = re.compile( r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)", re.IGNORECASE ) # P 值匹配,如 "P=0.05" 或 "p < 0.001" 或 "P值=0.05" PVALUE_PATTERN = re.compile( r"[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*)", re.IGNORECASE ) # 置信区间匹配,如 "95% CI: 1.2-2.5" 或 "(1.2, 2.5)" CI_PATTERN = re.compile( r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-–,]\s*(\d+\.?\d*)\s*[\)\]]?", re.IGNORECASE ) # OR/HR/RR 匹配 EFFECT_SIZE_PATTERN = re.compile( r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)", re.IGNORECASE ) # 卡方值匹配,如 "χ²=57.519" 或 "2=57.519" 或 "χ2=57.519" CHI_SQUARE_PATTERN = re.compile( r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)", re.IGNORECASE ) # ==================== 统计方法检测 ==================== METHOD_PATTERNS: Dict[str, Pattern] = { "t-test": re.compile( r"(t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)", re.IGNORECASE ), "chi-square": re.compile( r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)", re.IGNORECASE ), "anova": re.compile( r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)", re.IGNORECASE ), "logistic": re.compile( r"(logistic\s+regression|逻辑回归|二元回归|logit)", re.IGNORECASE ), "cox": re.compile( r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)", re.IGNORECASE ), "mann-whitney": re.compile( r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)", re.IGNORECASE ), "paired-t": re.compile( r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)", re.IGNORECASE ), } # ==================== 表格类型检测 ==================== # 基线特征表关键词 BASELINE_KEYWORDS = [ "baseline", "characteristics", "demographic", "基线", "特征", "人口学" ] # 结局表关键词 OUTCOME_KEYWORDS = [ "outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点" ] # ==================== 容错配置(终审建议) ==================== DEFAULT_TOLERANCE_PERCENT = 0.1 # 百分比容错 ±0.1% # P 值容错阈值 PVALUE_ERROR_THRESHOLD = 0.05 # P 值差异 > 0.05 → Error(严重矛盾) PVALUE_WARNING_THRESHOLD = 0.01 # P 值差异 > 0.01 → Warning(可能舍入误差) PVALUE_RELATIVE_TOLERANCE = 0.05 # P 值相对误差 ±5% # CI 容错阈值 CI_RELATIVE_TOLERANCE = 0.02 # CI 端点相对误差 ±2% # 统计量容错 STAT_RELATIVE_TOLERANCE = 0.05 # t/χ² 值相对误差 ±5% # ==================== Mean±SD 正则表达式 ==================== # Mean ± SD 格式,如 "45.2 ± 12.3" 或 "45.2±12.3" 或 "45.2 (12.3)" MEAN_SD_PATTERN = re.compile( r"(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*)", re.IGNORECASE ) # 带括号的 SD 格式,如 "45.2 (12.3)" - 用于某些表格 MEAN_SD_PAREN_PATTERN = re.compile( r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)", # 排除百分比格式 re.IGNORECASE ) # CI 格式清洗器(终审建议:处理多种分隔符) CI_PATTERNS = [ # 标准格式: 2.5 (1.1-3.5) 或 2.5 [1.1-3.5] re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE), # 带 CI 标签: 95% CI: 1.1-3.5 或 95%CI 1.1 to 3.5 re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*)", re.IGNORECASE), # 简单范围: 1.1-3.5(需要上下文判断) re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE), ] # ==================== 验证函数 ==================== def validate_file_size(size_bytes: int) -> tuple[bool, str]: """ 验证文件大小 Returns: (is_valid, error_message) """ if size_bytes > MAX_FILE_SIZE_BYTES: return False, f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)" return True, "" def validate_file_extension(filename: str) -> tuple[bool, str]: """ 验证文件扩展名 Returns: (is_valid, error_message) """ from pathlib import Path ext = Path(filename).suffix.lower() if ext not in ALLOWED_EXTENSIONS: if ext == ".doc": return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传" return False, f"不支持的文件格式: {ext},仅支持 .docx" return True, "" def detect_methods(text: str) -> list[str]: """ 检测文本中的统计方法(正则优先) Args: text: 文档全文 Returns: 检测到的方法列表 """ found = [] for method_name, pattern in METHOD_PATTERNS.items(): if pattern.search(text): found.append(method_name) return found