feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions

View File

@@ -0,0 +1,48 @@
"""
RVW V2.0 数据侦探模块 (Data Forensics)
提供 Word 文档表格提取和数据验证功能:
- 表格精准提取(python-docx)
- L1 算术自洽性验证
- L2 统计学复核(T检验、卡方检验)
- HTML 片段生成(含 R1C1 坐标)
Author: AIclinicalresearch Team
Version: 2.0.0
Date: 2026-02-17
"""
from .types import (
ForensicsConfig,
TableData,
Issue,
ForensicsResult,
ExtractionError,
Severity,
IssueType,
CellLocation,
)
from .extractor import DocxTableExtractor
from .validator import ArithmeticValidator, StatValidator
from .api import router as forensics_router
__all__ = [
# 类型
"ForensicsConfig",
"TableData",
"Issue",
"ForensicsResult",
"ExtractionError",
"Severity",
"IssueType",
"CellLocation",
# 核心类
"DocxTableExtractor",
"ArithmeticValidator",
"StatValidator",
# 路由
"forensics_router",
]
__version__ = "2.0.0"

View File

@@ -0,0 +1,221 @@
"""
数据侦探模块 - FastAPI 路由
提供 /api/v1/forensics/* 接口
API 端点:
- GET /api/v1/forensics/health - 健康检查
- POST /api/v1/forensics/analyze_docx - 分析 Word 文档
- GET /api/v1/forensics/supported_formats - 获取支持的格式
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
from pathlib import Path
import os
import time
from .types import ForensicsConfig, ForensicsResult, Severity
from .config import (
validate_file_size,
validate_file_extension,
detect_methods,
MAX_FILE_SIZE_BYTES,
ALLOWED_EXTENSIONS,
)
from .extractor import DocxTableExtractor
from .validator import ArithmeticValidator, StatValidator
# 创建路由器
router = APIRouter(prefix="/api/v1/forensics", tags=["forensics"])
# 临时文件目录
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
@router.get("/health")
async def forensics_health():
    """
    Health check for the forensics module.

    Returns:
        A "healthy" payload listing the detected versions of python-docx,
        pandas and scipy, or a "degraded" payload carrying the import error
        when one of them cannot be imported.
    """
    try:
        # Imported inside the handler so a missing package is reported by the
        # except branch below rather than left to propagate.
        import docx
        import pandas
        import scipy

        # The original guards docx.__version__ with hasattr(); getattr with a
        # default expresses the same fallback to "unknown".
        dependency_versions = {
            "python-docx": getattr(docx, '__version__', "unknown"),
            "pandas": pandas.__version__,
            "scipy": scipy.__version__,
        }
        return {
            "status": "healthy",
            "module": "forensics",
            "version": "2.0.0",
            "dependencies": dependency_versions,
        }
    except ImportError as e:
        return {
            "status": "degraded",
            "module": "forensics",
            "error": f"Missing dependency: {e}"
        }
@router.post("/analyze_docx")
async def analyze_docx(
    file: UploadFile = File(...),
    check_level: str = "L1_L2",
    tolerance_percent: float = 0.1,
    max_table_rows: int = 500
):
    """
    Analyze the tables of an uploaded Word document.

    Args:
        file: The uploaded .docx file.
        check_level: Validation level ("L1" or "L1_L2"); statistical checks
            only run for "L1_L2".
        tolerance_percent: Tolerance used by the percentage check.
        max_table_rows: Tables with more rows than this are skipped.

    Returns:
        A JSON response holding the serialized ForensicsResult. Unexpected
        failures are reported as a 500 response with ``success=False`` so the
        caller can fall back.

    Raises:
        HTTPException: 400 when the filename is missing, the extension is not
            allowed or the file is too large.
    """
    import uuid  # local import: only needed to build a unique temp-file name

    temp_path = None
    start_time = time.time()
    try:
        # 1. Validate the file name / extension. A missing name is a client
        #    error, not an internal failure.
        if not file.filename:
            raise HTTPException(status_code=400, detail="未提供文件名,仅支持 .docx 文件")
        is_valid, error_msg = validate_file_extension(file.filename)
        if not is_valid:
            logger.warning(f"文件格式校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        # 2. Read the upload into memory.
        content = await file.read()
        file_size = len(content)

        # 3. Validate the size.
        is_valid, error_msg = validate_file_size(file_size)
        if not is_valid:
            logger.warning(f"文件大小校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)
        logger.info(f"开始分析 Word 文档: {file.filename}, 大小: {file_size/1024:.1f}KB")

        # 4. Save to a temp file. The client-supplied name is untrusted: keep
        #    only its final path component (so "../" cannot escape TEMP_DIR)
        #    and add a random token so concurrent uploads of the same name in
        #    the same process cannot overwrite or delete each other's file.
        safe_name = Path(file.filename.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"forensics_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
        with open(temp_path, "wb") as f:
            f.write(content)

        # 5. Build the configuration.
        config = ForensicsConfig(
            check_level=check_level,
            tolerance_percent=tolerance_percent,
            max_table_rows=max_table_rows
        )

        # 6. Extract tables and body text.
        extractor = DocxTableExtractor(config)
        tables, full_text = extractor.extract(str(temp_path))

        # 7. Detect statistical methods mentioned in the text.
        methods_found = detect_methods(full_text)
        logger.info(f"检测到统计方法: {methods_found}")

        # 8. L1 arithmetic validation (validators append to table.issues).
        arithmetic_validator = ArithmeticValidator(config)
        for table in tables:
            if not table.skipped:
                arithmetic_validator.validate(table)

        # 9. L2 statistical validation, when enabled.
        if check_level == "L1_L2":
            stat_validator = StatValidator(config)
            for table in tables:
                if not table.skipped:
                    stat_validator.validate(table, full_text)

        # 10. Count issues by severity (INFO issues only count in the total).
        total_issues = 0
        error_count = 0
        warning_count = 0
        for table in tables:
            for issue in table.issues:
                total_issues += 1
                if issue.severity == Severity.ERROR:
                    error_count += 1
                elif issue.severity == Severity.WARNING:
                    warning_count += 1
        execution_time_ms = int((time.time() - start_time) * 1000)

        # 11. Build the result.
        result = ForensicsResult(
            success=True,
            methods_found=methods_found,
            tables=tables,
            total_issues=total_issues,
            error_count=error_count,
            warning_count=warning_count,
            execution_time_ms=execution_time_ms,
            error=None,
            fallback_available=True
        )
        logger.info(
            f"分析完成: {file.filename}, "
            f"表格: {len(tables)}, "
            f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), "
            f"耗时: {execution_time_ms}ms"
        )
        return JSONResponse(content=result.model_dump())
    except HTTPException:
        # Validation failures keep their own status code.
        raise
    except Exception as e:
        logger.error(f"分析失败: {file.filename} - {str(e)}")
        execution_time_ms = int((time.time() - start_time) * 1000)
        # Return a failure result so the caller can degrade gracefully.
        result = ForensicsResult(
            success=False,
            methods_found=[],
            tables=[],
            total_issues=0,
            error_count=0,
            warning_count=0,
            execution_time_ms=execution_time_ms,
            error=str(e),
            fallback_available=True
        )
        return JSONResponse(
            status_code=500,
            content=result.model_dump()
        )
    finally:
        # Best-effort removal of the temp file.
        if temp_path and temp_path.exists():
            try:
                os.remove(temp_path)
            except Exception as e:
                logger.warning(f"清理临时文件失败: {e}")
@router.get("/supported_formats")
async def supported_formats():
    """
    Describe which uploads the module accepts.

    Returns:
        A dict with the allowed extensions, the size limit in MB and a usage
        note.
    """
    size_limit_mb = MAX_FILE_SIZE_BYTES / 1024 / 1024
    extensions = [*ALLOWED_EXTENSIONS]
    payload = {
        "formats": extensions,
        "max_file_size_mb": size_limit_mb,
        "note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx",
    }
    return payload

View File

@@ -0,0 +1,182 @@
"""
数据侦探模块 - 配置和常量
包含文件限制、正则表达式、默认配置等。
"""
import re
from typing import Dict, Pattern
# ==================== File limits ====================
MAX_FILE_SIZE_MB = 20 # maximum upload size, in MB
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
MAX_TABLE_ROWS = 500 # maximum rows per table
MAX_TABLES_PER_DOC = 50 # maximum tables per document
ALLOWED_EXTENSIONS = {".docx"} # the MVP only supports .docx
# ==================== Regular expressions ====================
# "n (%)" cells such as "45 (50.0%)" or "45(50%)".
# Group 1 is n, group 2 the percentage. The "%" sign is optional, so a
# "mean (SD)" cell like "45.2 (12.3)" also matches this pattern.
PERCENT_PATTERN = re.compile(
    r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
    re.IGNORECASE
)
# P values such as "P=0.05", "p < 0.001" or "P值=0.05"; group 1 is the number.
# The comparison operator is not captured, and a leading digit is required
# (".05" does not match).
PVALUE_PATTERN = re.compile(
    r"[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# Confidence intervals such as "95% CI: 1.2-2.5" or "(1.2, 2.5)";
# groups 1 and 2 are the lower and upper bounds. The label and the brackets
# are all optional, so any "a-b" or "a, b" pair of numbers matches.
CI_PATTERN = re.compile(
    r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-,]\s*(\d+\.?\d*)\s*[\)\]]?",
    re.IGNORECASE
)
# Effect sizes written as "OR=1.5", "HR: 0.8" or "RR = 2"; group 1 is the value.
EFFECT_SIZE_PATTERN = re.compile(
    r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# ==================== Statistical method detection ====================
# Maps a method name to the pattern whose presence in the document text
# reports that method (see detect_methods). Patterns cover English and
# Chinese spellings.
METHOD_PATTERNS: Dict[str, Pattern] = {
    "t-test": re.compile(
        r"(t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
        re.IGNORECASE
    ),
    "chi-square": re.compile(
        r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
        re.IGNORECASE
    ),
    "anova": re.compile(
        r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
        re.IGNORECASE
    ),
    "logistic": re.compile(
        r"(logistic\s+regression|逻辑回归|二元回归|logit)",
        re.IGNORECASE
    ),
    "cox": re.compile(
        r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
        re.IGNORECASE
    ),
    "mann-whitney": re.compile(
        r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
        re.IGNORECASE
    ),
    "paired-t": re.compile(
        r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)",
        re.IGNORECASE
    ),
}
# ==================== Table type detection ====================
# Caption keywords that mark a baseline-characteristics table.
BASELINE_KEYWORDS = [
    "baseline", "characteristics", "demographic", "基线", "特征", "人口学"
]
# Caption keywords that mark an outcome table.
OUTCOME_KEYWORDS = [
    "outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点"
]
# ==================== Tolerances (from the final review) ====================
DEFAULT_TOLERANCE_PERCENT = 0.1 # percentage tolerance, ±0.1 percentage points
# P-value thresholds applied to |computed P - reported P|
PVALUE_ERROR_THRESHOLD = 0.05 # difference > 0.05 -> ERROR (serious contradiction)
PVALUE_WARNING_THRESHOLD = 0.01 # difference > 0.01 -> WARNING (possibly rounding)
PVALUE_RELATIVE_TOLERANCE = 0.05 # relative P-value tolerance, ±5%
# CI tolerance
CI_RELATIVE_TOLERANCE = 0.02 # relative tolerance on CI endpoints, ±2%
# Test-statistic tolerance
STAT_RELATIVE_TOLERANCE = 0.05 # relative tolerance on t / chi-square values, ±5%
# ==================== Mean±SD regular expressions ====================
# "Mean ± SD" cells such as "45.2 ± 12.3" or "45.2±12.3"; group 1 is the mean,
# group 2 the SD.
# NOTE(review): the separator class also accepts a bare "+" or "-", so a plain
# range like "1.1-3.5" is read as mean=1.1, SD=3.5 — confirm this is intended.
MEAN_SD_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
# SD in parentheses, e.g. "45.2 (12.3)", as used by some tables.
MEAN_SD_PAREN_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)", # rejects a ")" followed by "%"
    re.IGNORECASE
)
# CI format cleaners (final review: handle several separators), listed from
# the most to the least specific form. Groups 1 and 2 are the two bounds.
CI_PATTERNS = [
    # Bracketed form: 2.5 (1.1-3.5) or 2.5 [1.1-3.5]
    re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
    # Labelled form: 95% CI: 1.1-3.5 or 95%CI 1.1 to 3.5
    # NOTE(review): "[-–—,;to]+" is a character class, so it accepts any run
    # of the letters "t"/"o" (e.g. "ot", "tt"), not only the word "to".
    re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*)", re.IGNORECASE),
    # Bare range: 1.1-3.5 (needs context to be trusted as a CI)
    re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
]
# ==================== 验证函数 ====================
def validate_file_size(size_bytes: int) -> tuple[bool, str]:
    """
    Check an upload size against MAX_FILE_SIZE_BYTES.

    Args:
        size_bytes: Size of the uploaded content in bytes.

    Returns:
        (is_valid, error_message); the message is empty when the size is
        accepted.
    """
    if not size_bytes > MAX_FILE_SIZE_BYTES:
        return True, ""
    size_mb = size_bytes / 1024 / 1024
    return False, f"文件大小 ({size_mb:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
def validate_file_extension(filename: str) -> tuple[bool, str]:
    """
    Check that a file name carries an allowed extension.

    Args:
        filename: Name of the uploaded file. ``None`` or an empty string is
            rejected instead of raising, because upload frameworks may hand
            over a missing name.

    Returns:
        (is_valid, error_message); the message is empty when the extension is
        accepted. Legacy ``.doc`` files get a dedicated hint.
    """
    from pathlib import Path

    # Path(None) raises TypeError; treat a missing name as an invalid upload.
    if not filename:
        return False, "文件名为空,仅支持 .docx"

    ext = Path(filename).suffix.lower()
    if ext in ALLOWED_EXTENSIONS:
        return True, ""
    if ext == ".doc":
        return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"
    return False, f"不支持的文件格式: {ext},仅支持 .docx"
def detect_methods(text: str) -> list[str]:
    """
    List the statistical methods mentioned in a text.

    Args:
        text: Full document text.

    Returns:
        The METHOD_PATTERNS keys whose pattern occurs in ``text``, in the
        dictionary's own order.
    """
    return [
        method_name
        for method_name, pattern in METHOD_PATTERNS.items()
        if pattern.search(text)
    ]

View File

@@ -0,0 +1,340 @@
"""
数据侦探模块 - Word 表格提取器
使用 python-docx 解析 Word 文档,提取表格数据并生成 HTML 片段。
功能:
- 解析 Word DOM 结构
- 处理合并单元格(Forward Fill 策略)
- 关联表格 Caption向前回溯
- 生成 HTML 片段(含 data-coord 属性)
"""
from docx import Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from loguru import logger
from typing import List, Optional, Tuple
import re
from .types import TableData, Issue, Severity, IssueType, CellLocation, ForensicsConfig
from .config import (
MAX_TABLE_ROWS,
MAX_TABLES_PER_DOC,
BASELINE_KEYWORDS,
OUTCOME_KEYWORDS,
)
class DocxTableExtractor:
"""
Word 表格提取器
提取 .docx 文件中的所有表格,处理合并单元格,生成 HTML 片段。
"""
def __init__(self, config: ForensicsConfig):
    """Keep the run configuration and cache its per-table row limit."""
    self.max_table_rows = config.max_table_rows
    self.config = config
def extract(self, file_path: str) -> Tuple[List[TableData], str]:
    """
    Extract every table of a Word document together with its body text.

    Args:
        file_path: Path of the .docx file.

    Returns:
        (tables, full_text): the extracted tables in document order and the
        newline-joined text of the document's paragraphs.

    Raises:
        ValueError: If the document cannot be opened.
    """
    logger.info(f"开始提取表格: {file_path}")
    try:
        doc = Document(file_path)
    except Exception as e:
        logger.error(f"无法打开 Word 文档: {e}")
        raise ValueError(f"无法打开 Word 文档: {e}")

    # Paragraph text of the document, later used for method detection.
    full_text = "\n".join(para.text for para in doc.paragraphs)

    # Walk the body in document order so each table can be paired with the
    # paragraphs that immediately precede it.
    tables: List[TableData] = []
    recent_paragraphs: List[str] = []
    for element in doc.element.body:
        tag = element.tag
        if tag.endswith('p'):
            recent_paragraphs.append(Paragraph(element, doc).text.strip())
            # Only the three most recent paragraphs are caption candidates.
            del recent_paragraphs[:-3]
            continue
        if not tag.endswith('tbl'):
            continue

        if len(tables) >= MAX_TABLES_PER_DOC:
            logger.warning(f"表格数量超过限制 ({MAX_TABLES_PER_DOC}),跳过剩余表格")
            break

        tables.append(
            self._extract_table(
                table=Table(element, doc),
                table_id=f"tbl_{len(tables)}",
                caption=self._find_caption(recent_paragraphs),
            )
        )
        # Paragraphs before this table must not caption the next one.
        recent_paragraphs = []

    logger.info(f"提取完成: {len(tables)} 个表格, {len(full_text)} 字符")
    return tables, full_text
def _find_caption(self, prev_paragraphs: List[str]) -> Optional[str]:
    """
    Pick the table caption out of the paragraphs preceding a table.

    A caption starts with "Table" or "表", an optional gap, a number and then
    a ".", ":" or whitespace character (e.g. "Table 1. xxx", "表 1 xxx",
    "Table 1: xxx"). The paragraph closest to the table wins.

    Returns:
        The matching paragraph text, or None when none matches.
    """
    caption_re = re.compile(r"^(Table|表)\s*\d+[\.:\s]", re.IGNORECASE)
    nearest_first = reversed(prev_paragraphs)
    return next(
        (text for text in nearest_first if text and caption_re.match(text)),
        None,
    )
def _extract_table(
    self,
    table: Table,
    table_id: str,
    caption: Optional[str]
) -> TableData:
    """
    Turn one python-docx table into a TableData record.

    Args:
        table: The python-docx Table object.
        table_id: Identifier assigned to the table.
        caption: Caption text, if one was found.

    Returns:
        A TableData with cell data and pre-rendered HTML, or a ``skipped``
        TableData carrying a TABLE_SKIPPED warning when the table has more
        rows than ``self.max_table_rows``.
    """
    rows = table.rows
    row_count = len(rows)
    col_count = len(rows[0].cells) if rows else 0
    limit = self.max_table_rows

    if row_count <= limit:
        # Normal path: read the cells, render them, classify by caption.
        data = self._extract_with_merge_handling(table)
        html = self._generate_html(table_id, caption, data)
        return TableData(
            id=table_id,
            caption=caption,
            type=self._detect_table_type(caption),
            row_count=len(data),
            col_count=len(data[0]) if data else 0,
            html=html,
            data=data,
            issues=[],
            skipped=False,
            skip_reason=None
        )

    # Oversized table: record why it was skipped instead of extracting it.
    logger.warning(f"表格 {table_id} 行数 ({row_count}) 超过限制 ({self.max_table_rows}),跳过")
    skip_issue = Issue(
        severity=Severity.WARNING,
        type=IssueType.TABLE_SKIPPED,
        message=f"表格行数 ({row_count}) 超过限制 ({self.max_table_rows})",
        location=CellLocation(table_id=table_id, row=1, col=1),
        evidence={"row_count": row_count, "max_rows": self.max_table_rows}
    )
    return TableData(
        id=table_id,
        caption=caption,
        type=self._detect_table_type(caption),
        row_count=row_count,
        col_count=col_count,
        html=f"<p class='warning'>表格行数 ({row_count}) 超过限制 ({self.max_table_rows}),已跳过</p>",
        data=[],
        issues=[skip_issue],
        skipped=True,
        skip_reason=f"行数超限: {row_count} > {self.max_table_rows}"
    )
def _extract_with_merge_handling(self, table: Table) -> List[List[str]]:
    """
    Read a table into a rectangular matrix of cell texts.

    ``row.cells`` is treated as holding one entry per grid column, with a
    horizontally merged cell appearing once for every column it spans (the
    original code relied on the same property when comparing ``_tc``
    objects). Each entry is therefore written to its own column index, which
    copies a merged cell's text into every column it covers.

    The previous implementation additionally advanced the column index by
    the merge width while still iterating over the repeated entries, so the
    repeats were written a second time and the cells after a merged span
    were dropped (``[A, A, B]`` came out as ``[A, A, A]``).

    Vertical merges need no extra work here as long as python-docx returns
    the top cell for continuation rows — presumably the case; confirm
    against the installed python-docx version.

    Args:
        table: The python-docx Table object.

    Returns:
        One list per row; every row has as many columns as the first row.
        Rows with fewer cells are padded with "" and extra cells are ignored.
    """
    rows = table.rows
    if len(rows) == 0:
        return []

    # The first row defines the width of the matrix.
    num_cols = len(rows[0].cells)

    data: List[List[str]] = []
    for row in rows:
        row_values = [""] * num_cols
        prev_tc = None
        prev_text = ""
        for col_idx, cell in enumerate(row.cells):
            if col_idx >= num_cols:
                break
            if prev_tc is not None and cell._tc is prev_tc:
                # Same underlying cell as the previous column: a merged
                # span, so reuse the text already computed.
                text = prev_text
            else:
                text = self._get_cell_text(cell)
                prev_tc, prev_text = cell._tc, text
            row_values[col_idx] = text
        data.append(row_values)
    return data
def _get_cell_text(self, cell: _Cell) -> str:
    """
    Return the text of a cell as a single line.

    Every paragraph of the cell is stripped, the pieces are joined with one
    space, and the joined string is stripped again.
    """
    joined = " ".join(paragraph.text.strip() for paragraph in cell.paragraphs)
    return joined.strip()
def _generate_html(
    self,
    table_id: str,
    caption: Optional[str],
    data: List[List[str]]
) -> str:
    """
    Render table data as an HTML fragment.

    Every cell carries a ``data-coord`` attribute in R1C1 notation (1-based)
    so the front end can highlight it. The first row is rendered as the
    header (``<th>``); all other rows go into ``<tbody>``. Cell and caption
    text is passed through ``self._escape_html``.

    Returns:
        The HTML fragment, or a one-cell placeholder table when ``data`` is
        empty.
    """
    if not data:
        return f"<table id='{table_id}' class='forensics-table'><tr><td>空表格</td></tr></table>"

    escape = self._escape_html

    def render_row(cells: List[str], row_number: int, tag: str) -> List[str]:
        # One <tr> whose cells are tagged with their R{row}C{col} coordinate.
        rendered = [" <tr>"]
        rendered.extend(
            f' <{tag} data-coord="R{row_number}C{col_number}">{escape(value)}</{tag}>'
            for col_number, value in enumerate(cells, start=1)
        )
        rendered.append(" </tr>")
        return rendered

    lines = [f"<table id='{table_id}' class='forensics-table'>"]
    if caption:
        lines.append(f" <caption>{escape(caption)}</caption>")

    # Header: the first row is assumed to be the header row.
    lines.append(" <thead>")
    lines += render_row(data[0], 1, "th")
    lines.append(" </thead>")

    # Body: remaining rows, numbered from 2.
    lines.append(" <tbody>")
    for row_number, cells in enumerate(data[1:], start=2):
        lines += render_row(cells, row_number, "td")
    lines.append(" </tbody>")

    lines.append("</table>")
    return "\n".join(lines)
def _escape_html(self, text: str) -> str:
    """Escape the HTML special characters & < > " and ' in ``text``."""
    # A single translate pass gives the same result as the chained replaces:
    # each source character is rewritten exactly once, so the "&" introduced
    # by an entity is never escaped again.
    entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    }
    return text.translate(str.maketrans(entities))
def _detect_table_type(self, caption: Optional[str]) -> str:
    """
    Classify a table from its caption.

    Returns:
        "BASELINE" when the lower-cased caption contains a baseline keyword,
        otherwise "OUTCOME" when it contains an outcome keyword, otherwise
        "OTHER" (also for a missing or empty caption).
    """
    if not caption:
        return "OTHER"

    lowered = caption.lower()
    # Baseline keywords take precedence over outcome keywords.
    if any(keyword in lowered for keyword in BASELINE_KEYWORDS):
        return "BASELINE"
    if any(keyword in lowered for keyword in OUTCOME_KEYWORDS):
        return "OUTCOME"
    return "OTHER"

View File

@@ -0,0 +1,114 @@
"""
数据侦探模块 - 类型定义
定义所有数据结构,确保类型安全和接口一致性。
"""
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from enum import Enum
class Severity(str, Enum):
    """Severity of a detected issue."""
    ERROR = "ERROR" # serious error, possibly fabricated data
    WARNING = "WARNING" # warning that needs manual review
    INFO = "INFO" # informational note
class IssueType(str, Enum):
    """Kind of a detected issue, grouped by validation level."""
    # L1 arithmetic errors
    ARITHMETIC_PERCENT = "ARITHMETIC_PERCENT" # wrong percentage
    ARITHMETIC_SUM = "ARITHMETIC_SUM" # wrong sum
    ARITHMETIC_TOTAL = "ARITHMETIC_TOTAL" # wrong Total row
    # L2 statistical errors
    STAT_TTEST_PVALUE = "STAT_TTEST_PVALUE" # t-test P value mismatch
    STAT_CHI2_PVALUE = "STAT_CHI2_PVALUE" # chi-square P value mismatch
    STAT_CI_PVALUE_CONFLICT = "STAT_CI_PVALUE_CONFLICT" # CI and P value contradict each other
    # L2.5 consistency forensics (promoted in the final review)
    STAT_SE_TRIANGLE = "STAT_SE_TRIANGLE" # SE triangle check is inconsistent
    STAT_SD_GREATER_MEAN = "STAT_SD_GREATER_MEAN" # SD > Mean on a positive-valued measure
    STAT_REGRESSION_CI_P = "STAT_REGRESSION_CI_P" # regression coefficient CI vs P mismatch
    # Extraction problems
    EXTRACTION_WARNING = "EXTRACTION_WARNING" # extraction warning
    TABLE_SKIPPED = "TABLE_SKIPPED" # table skipped (over the limit)
# Run configuration of the forensics module.
# NOTE(review): the docstring and the description strings are left as-is; they
# are runtime text that pydantic may expose in the generated JSON schema.
class ForensicsConfig(BaseModel):
    """数据侦探配置"""
    # Validation level: "L1" (arithmetic only) or "L1_L2" (arithmetic plus
    # basic statistics).
    check_level: str = Field(
        default="L1_L2",
        description="验证级别L1仅算术、L1_L2算术+基础统计)"
    )
    # Tolerance of the percentage check, 0.1 by default.
    tolerance_percent: float = Field(
        default=0.1,
        description="百分比容错范围,默认 0.1%"
    )
    # Tables with more rows than this are skipped.
    max_table_rows: int = Field(
        default=500,
        description="单表最大行数,超出跳过"
    )
    # Maximum file size in MB.
    max_file_size_mb: int = Field(
        default=20,
        description="最大文件大小MB"
    )
# Position of a cell inside a table, in 1-based R1C1 coordinates.
# NOTE(review): the docstring and the description strings are left as-is; they
# are runtime text that pydantic may expose in the generated JSON schema.
class CellLocation(BaseModel):
    """单元格位置R1C1 坐标)"""
    table_id: str = Field(..., description="表格 ID如 tbl_0")
    row: int = Field(..., description="行号,从 1 开始")
    col: int = Field(..., description="列号,从 1 开始")
    @property
    def cell_ref(self) -> str:
        """Return the coordinate in R1C1 notation, e.g. "R2C3"."""
        return f"R{self.row}C{self.col}"
# One problem found in a table: severity, kind, human-readable message, and
# optionally the affected cell and the supporting evidence values.
# NOTE(review): the docstring and the description strings are left as-is; they
# are runtime text that pydantic may expose in the generated JSON schema.
class Issue(BaseModel):
    """发现的问题"""
    severity: Severity = Field(..., description="严重程度")
    type: IssueType = Field(..., description="问题类型")
    message: str = Field(..., description="人类可读的问题描述")
    location: Optional[CellLocation] = Field(None, description="问题位置")
    evidence: Optional[Dict[str, Any]] = Field(None, description="证据数据")
# One extracted table: identity, caption, detected type, dimensions,
# pre-rendered HTML, the cell matrix, and the issues found in it.
# NOTE(review): the docstring and the description strings are left as-is; they
# are runtime text that pydantic may expose in the generated JSON schema.
class TableData(BaseModel):
    """提取的表格数据"""
    id: str = Field(..., description="表格 ID如 tbl_0")
    caption: Optional[str] = Field(None, description="表格标题")
    type: Optional[str] = Field(None, description="表格类型BASELINE/OUTCOME/OTHER")
    row_count: int = Field(..., description="行数")
    col_count: int = Field(..., description="列数")
    html: str = Field(..., description="预渲染的 HTML 片段")
    data: List[List[str]] = Field(..., description="二维数组数据")
    # Validators append their findings to this list.
    issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表")
    # True when the table was not extracted (over the row limit).
    skipped: bool = Field(default=False, description="是否被跳过(超限)")
    skip_reason: Optional[str] = Field(None, description="跳过原因")
# Result of one analysis run: success flag, detected methods, tables, issue
# counters, timing, and the error text when the run failed.
# NOTE(review): the docstring and the description strings are left as-is; they
# are runtime text that pydantic may expose in the generated JSON schema.
class ForensicsResult(BaseModel):
    """数据侦探分析结果"""
    success: bool = Field(..., description="是否成功")
    methods_found: List[str] = Field(default_factory=list, description="检测到的统计方法")
    tables: List[TableData] = Field(default_factory=list, description="表格列表")
    total_issues: int = Field(default=0, description="总问题数")
    error_count: int = Field(default=0, description="ERROR 级别问题数")
    warning_count: int = Field(default=0, description="WARNING 级别问题数")
    execution_time_ms: int = Field(default=0, description="执行时间(毫秒)")
    error: Optional[str] = Field(None, description="错误信息(如果失败)")
    # Whether the caller may fall back to a degraded path.
    fallback_available: bool = Field(default=True, description="是否可降级执行")
class ExtractionError(Exception):
    """
    Raised when table extraction fails.

    Attributes:
        message: Human-readable description; also the exception's str().
        code: Machine-readable error code, "EXTRACTION_FAILED" by default.
    """

    def __init__(self, message: str, code: str = "EXTRACTION_FAILED"):
        super().__init__(message)
        self.message, self.code = message, code

View File

@@ -0,0 +1,839 @@
"""
数据侦探模块 - 验证器
包含 L1 算术验证、L2 统计验证、L2.5 一致性取证。
L1 算术验证:
- n (%) 格式验证
- Sum/Total 校验
- 容错逻辑
L2 统计验证:
- T 检验 P 值逆向验证
- 卡方检验 P 值逆向验证
- CI vs P 值逻辑检查
L2.5 一致性取证(终审提权):
- SE 三角验证(回归系数 CI↔P 一致性)
- SD > Mean 检查(正值指标启发式规则)
"""
import re
import math
from typing import List, Optional, Tuple
from loguru import logger
# scipy 用于统计计算
try:
from scipy import stats
SCIPY_AVAILABLE = True
except ImportError:
SCIPY_AVAILABLE = False
logger.warning("scipy 未安装L2 统计验证将受限")
from .types import (
TableData,
Issue,
Severity,
IssueType,
CellLocation,
ForensicsConfig,
)
from .config import (
PERCENT_PATTERN,
PVALUE_PATTERN,
CI_PATTERN,
MEAN_SD_PATTERN,
MEAN_SD_PAREN_PATTERN,
CI_PATTERNS,
EFFECT_SIZE_PATTERN,
DEFAULT_TOLERANCE_PERCENT,
PVALUE_ERROR_THRESHOLD,
PVALUE_WARNING_THRESHOLD,
STAT_RELATIVE_TOLERANCE,
)
class ArithmeticValidator:
"""
L1 算术自洽性验证器
验证表格中的数值计算是否正确:
- n (%) 格式中的百分比是否等于 n/N
- Total/Sum 行是否等于其他行之和
"""
def __init__(self, config: ForensicsConfig):
    """Keep the configuration and cache its percentage tolerance."""
    self.tolerance = config.tolerance_percent
    self.config = config
def validate(self, table: TableData) -> List[Issue]:
    """
    Run the L1 arithmetic checks on one table.

    The percentage check runs first, then the Sum/Total row check. The
    findings are also appended to ``table.issues``.

    Args:
        table: Table to validate.

    Returns:
        The issues found by this call; empty for skipped or empty tables.
    """
    if table.skipped or not table.data:
        return []

    found: List[Issue] = [
        *self._validate_percent_format(table),  # 1. n (%) cells
        *self._validate_sum_rows(table),        # 2. Sum/Total rows
    ]
    table.issues.extend(found)
    logger.debug(f"表格 {table.id} 算术验证完成: {len(found)} 个问题")
    return found
def _validate_percent_format(self, table: TableData) -> List[Issue]:
    """
    Check "n (%)" cells such as "45 (50.0%)" against n / N.

    For every data cell matching PERCENT_PATTERN the denominator N is looked
    up with ``_find_total_n``; cells without a usable N are not checked.

    The allowed difference is the larger of ``self.tolerance`` and the
    rounding error implied by the number of decimals in the reported value
    (0.5 for "49", 0.05 for "49.5", ...). Without this, a correctly rounded
    whole-number percentage (e.g. 45/91 = 49.45% reported as "49%") was
    flagged as an ERROR.

    Args:
        table: Table to check; the first row is treated as the header.

    Returns:
        One ARITHMETIC_PERCENT issue per inconsistent cell.
    """
    issues: List[Issue] = []
    data = table.data
    if len(data) < 2:  # need a header and at least one data row
        return issues

    # Header columns that may hold N (e.g. "n", "N", "Total", "合计").
    n_col_indices = self._find_n_columns(data[0])

    for row_idx, row in enumerate(data[1:], start=2):  # 1-based, row 2 = first data row
        for col_idx, cell in enumerate(row, start=1):
            match = PERCENT_PATTERN.search(cell)
            if not match:
                continue
            n_value = float(match.group(1))
            reported_text = match.group(2)
            reported_percent = float(reported_text)

            # _find_total_n expects 0-based indices.
            total_n = self._find_total_n(data, row_idx - 1, col_idx - 1, n_col_indices)
            if total_n is None or total_n <= 0:
                continue

            calculated_percent = (n_value / total_n) * 100
            diff = abs(calculated_percent - reported_percent)

            # Half a unit in the last reported decimal place is ordinary
            # rounding, never an arithmetic error.
            decimals = len(reported_text.partition(".")[2])
            allowed = max(self.tolerance, 0.5 * 10 ** -decimals)
            if diff <= allowed + 1e-9:  # epsilon absorbs float noise
                continue

            issues.append(Issue(
                severity=Severity.ERROR,
                type=IssueType.ARITHMETIC_PERCENT,
                message=f"百分比计算错误: 报告值 {reported_percent}%,计算值 {calculated_percent:.1f}% (n={n_value}, N={total_n})",
                location=CellLocation(
                    table_id=table.id,
                    row=row_idx,
                    col=col_idx
                ),
                evidence={
                    "n": n_value,
                    "N": total_n,
                    "reported_percent": reported_percent,
                    "calculated_percent": round(calculated_percent, 2),
                    "difference": round(diff, 2),
                    "tolerance": allowed
                }
            ))
    return issues
def _find_n_columns(self, header: List[str]) -> List[int]:
    """
    Find the header columns that may hold the total N.

    Latin keywords ("n", "total", "overall", "all", "sum") must appear as a
    standalone word: a plain substring test made the single letter "n" match
    nearly every header ("Intervention", "Control", "Mean", ...) and "all"
    match words such as "small". The Chinese keywords are matched as
    substrings, as before.

    Args:
        header: Cells of the header row.

    Returns:
        0-based indices of the matching columns, in ascending order.
    """
    # A keyword counts only when it is not glued to another Latin letter;
    # digits and punctuation may follow, so "n=90" and "N" both match.
    word_pattern = re.compile(r"(?<![a-z])(?:n|total|overall|all|sum)(?![a-z])")
    cjk_keywords = ("合计", "总数")

    indices = []
    for idx, cell in enumerate(header):
        cell_lower = cell.lower().strip()
        if word_pattern.search(cell_lower) or any(kw in cell_lower for kw in cjk_keywords):
            indices.append(idx)
    return indices
def _find_total_n(
    self,
    data: List[List[str]],
    row_idx: int,
    col_idx: int,
    n_col_indices: List[int]
) -> Optional[float]:
    """
    Look up the denominator N for the cell at (row_idx, col_idx).

    Strategies, in order:
    1. The first N column of the same row that holds a positive number.
    2. The cell of the same column in the first data row (``data[1]``),
       which may carry the group total.

    The cell being checked is never used as its own denominator: its own
    column is skipped in strategy 1, and strategy 2 is not applied to the
    first data row itself. Previously both cases returned the cell's own n,
    which made the computed percentage 100% and produced false errors.

    Args:
        data: Table matrix; ``data[0]`` is the header row.
        row_idx: 0-based row index of the cell.
        col_idx: 0-based column index of the cell.
        n_col_indices: 0-based indices of candidate N columns.

    Returns:
        The positive N that was found, or None.
    """
    row = data[row_idx]

    # Strategy 1: an N column in the same row (other than the cell itself).
    for n_col in n_col_indices:
        if n_col == col_idx or n_col >= len(row):
            continue
        n_val = self._parse_number(row[n_col])
        if n_val is not None and n_val > 0:
            return n_val

    # Strategy 2: the same column of the first data row, unless the cell
    # lives in that row.
    if row_idx > 1:
        first_data_row = data[1] if len(data) > 1 else None
        if first_data_row and col_idx < len(first_data_row):
            n_val = self._parse_number(first_data_row[col_idx])
            if n_val is not None and n_val > 0:
                return n_val

    # No further heuristic (e.g. summing the row) is attempted.
    return None
def _parse_number(self, text: str) -> Optional[float]:
    """
    Parse the number at the start of a cell.

    Commas and spaces are removed first, so "1,234" and "1 234" both read as
    1234. Only a leading unsigned number is taken; whatever follows it is
    ignored ("45 (50%)" gives 45.0).

    Returns:
        The parsed value, or None for empty text or text that does not start
        with a digit.
    """
    if not text:
        return None

    compact = text.strip().replace(",", "").replace(" ", "")
    leading = re.match(r"^(\d+(?:\.\d+)?)", compact)
    if leading is None:
        return None
    try:
        return float(leading.group(1))
    except ValueError:
        return None
def _validate_sum_rows(self, table: TableData) -> List[Issue]:
    """
    Check Sum/Total rows against the rows above them.

    A row is a total row when its first cell contains one of the total
    keywords. For each numeric cell of such a row (first column excluded),
    the same column of every data row above it is summed; the check is
    dropped for a column as soon as a non-numeric cell is met, and for sums
    that are not positive. A difference above 0.5 is reported.

    Args:
        table: Table to check; the first row is treated as the header.

    Returns:
        One ARITHMETIC_SUM issue per inconsistent total cell.
    """
    issues: List[Issue] = []
    data = table.data
    if len(data) < 3:  # header + data row + total row at least
        return issues

    total_keywords = ["total", "sum", "合计", "总计", "总和", "all"]

    for row_idx, row in enumerate(data[1:], start=2):  # 1-based; header skipped
        label = row[0].lower().strip() if row else ""
        if not any(kw in label for kw in total_keywords):
            continue

        for col_idx, cell in enumerate(row[1:], start=2):  # first column skipped
            total_val = self._parse_number(cell)
            if total_val is None:
                continue

            # Add up this column over the data rows above the total row.
            column_sum = 0.0
            summable = True
            for above in data[1:row_idx - 1]:
                if col_idx - 1 >= len(above):
                    continue  # short row: nothing to add
                value = self._parse_number(above[col_idx - 1])
                if value is None:
                    # A non-numeric cell makes the column unverifiable.
                    summable = False
                    break
                column_sum += value

            if not (summable and column_sum > 0):
                continue

            diff = abs(total_val - column_sum)
            if diff > 0.5:  # allow for decimal rounding
                issues.append(Issue(
                    severity=Severity.ERROR,
                    type=IssueType.ARITHMETIC_SUM,
                    message=f"合计行计算错误: 报告值 {total_val},计算值 {column_sum}",
                    location=CellLocation(
                        table_id=table.id,
                        row=row_idx,
                        col=col_idx
                    ),
                    evidence={
                        "reported_total": total_val,
                        "calculated_sum": column_sum,
                        "difference": round(diff, 2)
                    }
                ))
    return issues
class StatValidator:
"""
L2 统计学复核验证器 + L2.5 一致性取证
验证统计检验结果的合理性:
- T 检验 P 值逆向验证
- 卡方检验 P 值逆向验证(基于频数表)
- CI 与 P 值逻辑一致性检查
- SE 三角验证(回归系数 CI↔P 一致性)
- SD > Mean 检查(正值指标启发式规则)
"""
def __init__(self, config: ForensicsConfig):
    """Keep the run configuration; ``validate`` reads its ``check_level``."""
    self.config = config
def validate(self, table: TableData, full_text: str) -> List[Issue]:
    """
    Run the L2 / L2.5 statistical checks on one table.

    Checks, in order: CI vs P consistency, t-test re-computation (only when
    scipy is available), SE triangle, SD > Mean. The findings are also
    appended to ``table.issues``.

    Args:
        table: Table to validate.
        full_text: Full document text (intended for method detection; not
            read by this method).

    Returns:
        The issues found by this call; empty for skipped or empty tables and
        when the configured check level is not "L1_L2".
    """
    if table.skipped or not table.data:
        return []
    if self.config.check_level != "L1_L2":
        return []

    # 1. CI vs P value logic
    issues: List[Issue] = list(self._validate_ci_pvalue_consistency(table))
    # 2. t-test re-computation
    if SCIPY_AVAILABLE:
        issues += self._validate_ttest(table)
    # 3. SE triangle (regression CI vs P)
    issues += self._validate_se_triangle(table)
    # 4. SD > Mean heuristic
    issues += self._validate_sd_greater_mean(table)

    table.issues.extend(issues)
    logger.debug(f"表格 {table.id} 统计验证完成: {len(issues)} 个问题")
    return issues
def _validate_ci_pvalue_consistency(self, table: TableData) -> List[Issue]:
    """
    Check that a row's 95% CI and P value agree.

    Rule applied to every data row in which both a CI and a P value are
    found:
    - CI contains 1.0 (e.g. 0.8-1.2)  -> P must be >= 0.05
    - CI excludes 1.0 (e.g. 1.1-1.5)  -> P must be < 0.05
    A row breaking the rule is reported as an ERROR located at column 1.

    NOTE(review): the null value 1.0 suits ratio measures (OR/HR/RR); the
    rule is applied to every row with a CI — confirm that is intended.

    Args:
        table: Table to check; the first row is treated as the header.

    Returns:
        One STAT_CI_PVALUE_CONFLICT issue per contradictory row.
    """
    issues: List[Issue] = []
    for row_idx, row in enumerate(table.data[1:], start=2):
        row_text = " ".join(row)

        ci_result = self._parse_ci(row_text)
        if ci_result is None:
            continue
        ci_lower, ci_upper = ci_result

        pvalue = self._parse_pvalue(row_text)
        if pvalue is None:
            continue

        ci_crosses_one = ci_lower <= 1.0 <= ci_upper
        p_significant = pvalue < 0.05

        # Consistent rows have exactly one of the two flags set.
        if ci_crosses_one != p_significant:
            continue

        if ci_crosses_one:
            # CI contains 1 although P < 0.05
            message = f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 跨越 1.0,但 P={pvalue} < 0.05"
        else:
            # CI excludes 1 although P >= 0.05
            message = f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 不跨越 1.0,但 P={pvalue} ≥ 0.05"

        issues.append(Issue(
            severity=Severity.ERROR,
            type=IssueType.STAT_CI_PVALUE_CONFLICT,
            message=message,
            location=CellLocation(
                table_id=table.id,
                row=row_idx,
                col=1  # the issue concerns the whole row
            ),
            evidence={
                "ci_lower": ci_lower,
                "ci_upper": ci_upper,
                "ci_crosses_one": ci_crosses_one,
                "pvalue": pvalue,
                "p_significant": p_significant
            }
        ))
    return issues
def _validate_ttest(self, table: TableData) -> List[Issue]:
    """
    Re-compute the t-test P value of each row and compare it to the report.

    For a row holding at least two "mean ± SD" values and a P value, the
    first two groups are compared:
        t  = |M1 - M2| / sqrt(SD1²/n1 + SD2²/n2)
        df = n1 + n2 - 2
        P  = 2 * (1 - t.cdf(t, df))
    Sample sizes come from ``_estimate_sample_sizes``; rows without them are
    not checked. A difference from the reported P above
    PVALUE_ERROR_THRESHOLD is an ERROR, above PVALUE_WARNING_THRESHOLD a
    WARNING.

    Args:
        table: Table to check; the first row is treated as the header.

    Returns:
        STAT_TTEST_PVALUE issues, located at column 1 of the affected row.
        Empty when scipy is unavailable.
    """
    issues: List[Issue] = []
    if not SCIPY_AVAILABLE:
        return issues
    data = table.data
    if len(data) < 2:
        return issues

    for row_idx, row in enumerate(data[1:], start=2):
        row_text = " ".join(row)
        groups = list(MEAN_SD_PATTERN.finditer(row_text))
        if len(groups) < 2:
            continue  # need two groups in the same row

        try:
            m1, sd1 = float(groups[0].group(1)), float(groups[0].group(2))
            m2, sd2 = float(groups[1].group(1)), float(groups[1].group(2))

            pvalue = self._parse_pvalue(row_text)
            if pvalue is None:
                continue

            n1, n2 = self._estimate_sample_sizes(table, row_idx)
            if n1 is None or n2 is None:
                continue

            se = math.sqrt(sd1**2/n1 + sd2**2/n2)
            if se == 0:
                continue
            t_calc = abs(m1 - m2) / se
            df = n1 + n2 - 2
            p_calc = 2 * (1 - stats.t.cdf(t_calc, df))
            p_diff = abs(p_calc - pvalue)

            if p_diff > PVALUE_ERROR_THRESHOLD:
                # Serious contradiction: report the full computation.
                severity = Severity.ERROR
                message = f"T 检验 P 值不一致: 报告 P={pvalue},计算 P={p_calc:.4f}(差异 {p_diff:.3f}"
                evidence = {
                    "group1": {"mean": m1, "sd": sd1, "n": n1},
                    "group2": {"mean": m2, "sd": sd2, "n": n2},
                    "t_calculated": round(t_calc, 3),
                    "df": df,
                    "p_calculated": round(p_calc, 4),
                    "p_reported": pvalue,
                    "p_difference": round(p_diff, 4)
                }
            elif p_diff > PVALUE_WARNING_THRESHOLD:
                # Small deviation, possibly rounding.
                severity = Severity.WARNING
                message = f"T 检验 P 值轻微偏差: 报告 P={pvalue},计算 P={p_calc:.4f}(可能是舍入误差)"
                evidence = {
                    "p_calculated": round(p_calc, 4),
                    "p_reported": pvalue,
                    "p_difference": round(p_diff, 4)
                }
            else:
                continue

            issues.append(Issue(
                severity=severity,
                type=IssueType.STAT_TTEST_PVALUE,
                message=message,
                location=CellLocation(
                    table_id=table.id,
                    row=row_idx,
                    col=1
                ),
                evidence=evidence
            ))
        except (ValueError, TypeError, ZeroDivisionError) as e:
            logger.debug(f"T 检验验证失败: {e}")
            continue
    return issues
def _validate_se_triangle(self, table: TableData) -> List[Issue]:
    """
    SE triangle check for ratio estimates (OR / HR / RR).

    For a row holding an effect size, a CI and a P value:
        SE = (ln(CI_upper) - ln(CI_lower)) / 3.92      (95% CI)
        Z  = |ln(effect)| / SE
        P  = 2 * (1 - norm.cdf(Z))
    A difference from the reported P above PVALUE_ERROR_THRESHOLD is an
    ERROR, above PVALUE_WARNING_THRESHOLD a WARNING. Rows with a
    non-positive effect size or CI bound, or with lower >= upper, are not
    checked.

    Args:
        table: Table to check; the first row is treated as the header.

    Returns:
        STAT_SE_TRIANGLE issues, located at column 1 of the affected row.
        Empty when scipy is unavailable.
    """
    issues: List[Issue] = []
    data = table.data
    if not SCIPY_AVAILABLE:
        return issues

    for row_idx, row in enumerate(data[1:], start=2):
        row_text = " ".join(row)

        # Effect size (OR/HR/RR)
        effect_match = EFFECT_SIZE_PATTERN.search(row_text)
        if not effect_match:
            continue
        try:
            effect_size = float(effect_match.group(1))
            if effect_size <= 0:
                continue
        except (ValueError, TypeError):
            continue

        # Confidence interval: both bounds positive and lower < upper
        ci_result = self._parse_ci(row_text)
        if ci_result is None:
            continue
        ci_lower, ci_upper = ci_result
        if ci_lower <= 0 or ci_upper <= 0 or ci_lower >= ci_upper:
            continue

        # Reported P value
        pvalue = self._parse_pvalue(row_text)
        if pvalue is None:
            continue

        try:
            ln_effect = math.log(effect_size)
            ln_ci_lower = math.log(ci_lower)
            ln_ci_upper = math.log(ci_upper)

            se = (ln_ci_upper - ln_ci_lower) / 3.92
            if se <= 0:
                continue
            z = abs(ln_effect) / se
            p_calc = 2 * (1 - stats.norm.cdf(z))
            p_diff = abs(p_calc - pvalue)

            if p_diff > PVALUE_ERROR_THRESHOLD:
                # Serious contradiction: report the full computation.
                severity = Severity.ERROR
                message = f"SE 三角验证不一致: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(差异 {p_diff:.3f}"
                evidence = {
                    "effect_size": effect_size,
                    "ci_lower": ci_lower,
                    "ci_upper": ci_upper,
                    "se_calculated": round(se, 4),
                    "z_calculated": round(z, 3),
                    "p_calculated": round(p_calc, 4),
                    "p_reported": pvalue,
                    "p_difference": round(p_diff, 4)
                }
            elif p_diff > PVALUE_WARNING_THRESHOLD:
                # Small deviation, possibly rounding.
                severity = Severity.WARNING
                message = f"SE 三角验证轻微偏差: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(可能是舍入误差)"
                evidence = {
                    "effect_size": effect_size,
                    "p_calculated": round(p_calc, 4),
                    "p_reported": pvalue,
                    "p_difference": round(p_diff, 4)
                }
            else:
                continue

            issues.append(Issue(
                severity=severity,
                type=IssueType.STAT_SE_TRIANGLE,
                message=message,
                location=CellLocation(
                    table_id=table.id,
                    row=row_idx,
                    col=1
                ),
                evidence=evidence
            ))
        except (ValueError, ZeroDivisionError, TypeError) as e:
            logger.debug(f"SE 三角验证失败: {e}")
            continue
    return issues
def _validate_sd_greater_mean(self, table: TableData) -> List[Issue]:
    """
    Heuristic check that flags cells whose SD exceeds a positive mean.

    Every cell below the first row (the first row is treated as the header)
    is searched with MEAN_SD_PATTERN and, if that fails, with
    MEAN_SD_PAREN_PATTERN. When the parsed mean is positive and the parsed
    SD is larger than it, one issue is emitted:

    - ERROR when the column header or the row label (first cell of the row)
      names one of the strictly positive measurements listed in
      ``positive_indicators`` (age, weight, blood pressure, lab values, ...).
    - WARNING otherwise, because SD > mean can be legitimate for difference
      scores or strongly skewed variables.

    ASCII keywords are matched as whole words (an optional plural "s" is
    accepted) so that short keywords do not fire inside unrelated words,
    e.g. "age" in "percentage" or "alt" in "health". Non-ASCII keywords
    are still matched as substrings, since CJK text has no word separators.

    Args:
        table: Table to inspect; ``table.data`` is read as a list of rows of
            cell strings and ``table.id`` is copied into each issue location.

    Returns:
        The issues found. Locations use 1-based coordinates, so the first
        data row is row 2.
    """
    issues: List[Issue] = []
    data = table.data
    # A header row plus at least one data row is required.
    if len(data) < 2:
        return issues
    header = data[0]
    # Keywords for measurements that should never show SD > mean.
    positive_indicators = [
        "age", "年龄", "weight", "体重", "bmi", "height", "身高",
        "sbp", "dbp", "血压", "heart rate", "心率", "pulse", "脉搏",
        "wbc", "rbc", "hgb", "plt", "白细胞", "红细胞", "血红蛋白", "血小板",
        "creatinine", "肌酐", "bun", "尿素氮", "glucose", "血糖",
        "alt", "ast", "转氨酶", "bilirubin", "胆红素",
        "cost", "费用", "time", "时间", "duration", "持续"
    ]
    ascii_keywords = [kw for kw in positive_indicators if kw.isascii()]
    other_keywords = [kw for kw in positive_indicators if not kw.isascii()]
    # Letter-based lookarounds instead of \b: "age_years" and "年龄age" still
    # match, while "percentage" and "breast" do not. The context text is
    # lower-cased before matching, so [a-z] is sufficient.
    ascii_keyword_re = re.compile(
        r"(?<![a-z])(?:"
        + "|".join(re.escape(kw) for kw in ascii_keywords)
        + r")s?(?![a-z])"
    )
    for row_idx, row in enumerate(data[1:], start=2):
        for col_idx, cell in enumerate(row, start=1):
            # Try the "mean±SD" form first, then the parenthesised form.
            match = MEAN_SD_PATTERN.search(cell) or MEAN_SD_PAREN_PATTERN.search(cell)
            if not match:
                continue
            try:
                mean_val = float(match.group(1))
                sd_val = float(match.group(2))
            except (ValueError, TypeError):
                continue
            # Only positive means are considered; anything else is skipped.
            if not (mean_val > 0 and sd_val > mean_val):
                continue
            # Context = column header (when the column has one) + row label.
            context_text = ""
            if col_idx - 1 < len(header):
                context_text += header[col_idx - 1].lower()
            context_text += " " + row[0].lower()
            is_positive_indicator = (
                ascii_keyword_re.search(context_text) is not None
                or any(kw in context_text for kw in other_keywords)
            )
            # Coefficient of variation; mean_val > 0 is guaranteed here.
            cv = sd_val / mean_val
            evidence = {
                "mean": mean_val,
                "sd": sd_val,
                "cv": round(cv, 3)
            }
            if is_positive_indicator:
                # Known positive measurement: SD > mean is treated as an error.
                severity = Severity.ERROR
                message = f"SD 大于 Mean 异常: {mean_val}±{sd_val}CV={cv:.1%},该指标通常为正值"
                evidence["context"] = context_text[:50]
            else:
                # Unknown measurement: only ask the reviewer to double-check.
                severity = Severity.WARNING
                message = f"SD 大于 Mean: {mean_val}±{sd_val}CV={cv:.1%},建议核查数据分布"
            issues.append(Issue(
                severity=severity,
                type=IssueType.STAT_SD_GREATER_MEAN,
                message=message,
                location=CellLocation(
                    table_id=table.id,
                    row=row_idx,
                    col=col_idx
                ),
                evidence=evidence
            ))
    return issues
# ==================== 辅助方法 ====================
def _parse_ci(self, text: str) -> Optional[Tuple[float, float]]:
    """
    Extract a confidence interval ``(lower, upper)`` from free text.

    Each pattern in CI_PATTERNS is tried in order, then CI_PATTERN as a
    fallback. A candidate is accepted only if both captured groups convert
    to float and lower < upper; otherwise the next pattern is tried.

    Formats the author lists as supported (actual coverage depends on the
    patterns, which are defined elsewhere):
    - 2.5 (1.1-3.5)
    - 2.5 (1.1, 3.5)
    - 2.5 [1.1; 3.5]
    - 95% CI: 1.1-3.5
    - 95% CI 1.1 to 3.5

    Args:
        text: Text to search, e.g. the joined cells of a table row.

    Returns:
        ``(lower, upper)`` for the first acceptable match, or ``None``.
    """
    def _ordered_bounds(found, tolerated):
        # Convert groups 1 and 2; reject unparsable or non-increasing bounds.
        try:
            low = float(found.group(1))
            high = float(found.group(2))
        except tolerated:
            return None
        return (low, high) if low < high else None

    for candidate in CI_PATTERNS:
        found = candidate.search(text)
        if found is None:
            continue
        bounds = _ordered_bounds(found, (ValueError, TypeError, IndexError))
        if bounds is not None:
            return bounds

    # Fall back to the original single CI_PATTERN.
    legacy = CI_PATTERN.search(text)
    if legacy is None:
        return None
    return _ordered_bounds(legacy, (ValueError, TypeError))
def _parse_pvalue(self, text: str) -> Optional[float]:
    """
    Extract the numeric part of a reported P value from free text.

    Inputs the author lists as handled (actual coverage depends on
    PVALUE_PATTERN, which is defined elsewhere):
    - P=0.05
    - P<0.001
    - P>0.05
    - p值=0.05

    NOTE(review): only capture group 1 is returned, so any comparison sign
    in the text is not passed on to the caller. "P<0.001" and "P=0.001"
    would presumably both yield 0.001 - confirm callers account for this.

    Args:
        text: Text to search.

    Returns:
        The first matched value as a float, or ``None`` when nothing matches
        or the captured text cannot be converted.
    """
    found = PVALUE_PATTERN.search(text)
    if not found:
        return None
    try:
        value = float(found.group(1))
    except (ValueError, TypeError):
        return None
    return value
def _estimate_sample_sizes(
self,
table: TableData,
row_idx: int
) -> Tuple[Optional[int], Optional[int]]:
"""
尝试从表格中估计样本量
策略:
1. 查找表头中的 n 值
2. 查找 "(n=XX)" 格式
3. 默认返回 None
"""
data = table.data
header = data[0] if data else []
# 从表头查找 (n=XX) 格式
n_pattern = re.compile(r"\(?\s*n\s*[=:]\s*(\d+)\s*\)?", re.IGNORECASE)
n_values = []
for cell in header:
match = n_pattern.search(cell)
if match:
try:
n_values.append(int(match.group(1)))
except ValueError:
pass
if len(n_values) >= 2:
return n_values[0], n_values[1]
# 如果找不到,返回 None不进行验证
return None, None