feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions

View File

@@ -0,0 +1,328 @@
"""
统计方法分析脚本
分析测试文档中的统计方法:
1. 文档中实际使用了哪些方法
2. 我们的系统能识别哪些
3. 识别出来的哪些可以验证
"""
import os
import sys
import re
from pathlib import Path
from docx import Document
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.config import METHOD_PATTERNS, detect_methods
# 测试文件目录
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
# ==================== 完整的统计方法列表 ====================
# 医学研究论文中常见的统计方法
ALL_KNOWN_METHODS = {
# 参数检验
"t-test": {
"names": ["t检验", "t-test", "student t", "独立样本t", "两样本t"],
"category": "参数检验",
"can_validate": True, # Week 2 实现 T检验逆向验证
"validation_note": "根据均值、标准差、样本量反推 t 值",
},
"paired-t": {
"names": ["配对t", "paired t", "前后对比"],
"category": "参数检验",
"can_validate": False, # V2.1 实现
"validation_note": "需要配对数据MVP 不支持",
},
"anova": {
"names": ["方差分析", "ANOVA", "F检验", "单因素方差分析", "多因素方差分析", "重复测量方差分析"],
"category": "参数检验",
"can_validate": False, # V2.1 实现
"validation_note": "多组比较复杂度高MVP 不支持",
},
# 非参数检验
"chi-square": {
"names": ["卡方检验", "χ²", "χ2", "chi-square", "pearson卡方", "Fisher精确检验"],
"category": "非参数检验",
"can_validate": True, # Week 2 实现卡方检验逆向验证
"validation_note": "根据频数表反推卡方值",
},
"mann-whitney": {
"names": ["Mann-Whitney", "秩和检验", "U检验", "Wilcoxon秩和"],
"category": "非参数检验",
"can_validate": False, # V2.1 实现
"validation_note": "非参数检验,需原始数据",
},
"wilcoxon": {
"names": ["Wilcoxon符号秩", "配对秩"],
"category": "非参数检验",
"can_validate": False,
"validation_note": "配对非参数检验",
},
"kruskal-wallis": {
"names": ["Kruskal-Wallis", "H检验"],
"category": "非参数检验",
"can_validate": False,
"validation_note": "多组非参数比较",
},
# 回归分析
"logistic": {
"names": ["Logistic回归", "logit", "二元回归", "多因素logistic"],
"category": "回归分析",
"can_validate": False, # V2.1 实现
"validation_note": "复杂模型,需原始数据",
},
"linear": {
"names": ["线性回归", "多元回归", "OLS"],
"category": "回归分析",
"can_validate": False,
"validation_note": "需原始数据",
},
"cox": {
"names": ["Cox回归", "比例风险模型", "生存分析"],
"category": "生存分析",
"can_validate": False,
"validation_note": "生存分析,复杂度高",
},
# 生存分析
"kaplan-meier": {
"names": ["Kaplan-Meier", "KM曲线", "生存曲线"],
"category": "生存分析",
"can_validate": False,
"validation_note": "图形方法",
},
"log-rank": {
"names": ["Log-rank", "对数秩检验"],
"category": "生存分析",
"can_validate": False,
"validation_note": "生存曲线比较",
},
# 相关分析
"pearson": {
"names": ["Pearson相关", "相关系数r", "积差相关"],
"category": "相关分析",
"can_validate": False,
"validation_note": "需原始数据",
},
"spearman": {
"names": ["Spearman相关", "秩相关", "等级相关"],
"category": "相关分析",
"can_validate": False,
"validation_note": "非参数相关",
},
# 诊断分析
"roc": {
"names": ["ROC曲线", "AUC", "曲线下面积", "受试者工作特征"],
"category": "诊断分析",
"can_validate": False,
"validation_note": "诊断准确性分析",
},
# 事后检验
"lsd": {
"names": ["LSD检验", "最小显著差异"],
"category": "事后检验",
"can_validate": False,
"validation_note": "ANOVA 事后比较",
},
"bonferroni": {
"names": ["Bonferroni", "校正"],
"category": "事后检验",
"can_validate": False,
"validation_note": "多重比较校正",
},
}
# 扩展正则模式 - 用于全面检测
EXTENDED_PATTERNS = {
"t-test": re.compile(r"(t[\s\-]?检验|t[\s\-]?test|student|独立样本t|两样本t|t\s*=\s*\d)", re.I),
"paired-t": re.compile(r"(配对[\s\-]?t|paired[\s\-]?t|前后对比)", re.I),
"chi-square": re.compile(r"(χ2|χ²|卡方|chi[\s\-]?square|fisher精确|fisher exact)", re.I),
"anova": re.compile(r"(方差分析|anova|f[\s\-]?检验|单因素|多因素|重复测量)", re.I),
"mann-whitney": re.compile(r"(mann[\s\-]?whitney|秩和检验|u[\s\-]?检验|非参数)", re.I),
"wilcoxon": re.compile(r"(wilcoxon符号秩|配对秩检验)", re.I),
"kruskal-wallis": re.compile(r"(kruskal[\s\-]?wallis|h检验)", re.I),
"logistic": re.compile(r"(logistic回归|logistic regression|二元回归|多因素logistic|logit)", re.I),
"linear": re.compile(r"(线性回归|多元回归|linear regression|ols)", re.I),
"cox": re.compile(r"(cox回归|cox regression|比例风险|proportional hazard)", re.I),
"kaplan-meier": re.compile(r"(kaplan[\s\-]?meier|km曲线|生存曲线)", re.I),
"log-rank": re.compile(r"(log[\s\-]?rank|对数秩)", re.I),
"pearson": re.compile(r"(pearson相关|相关系数r|积差相关|r\s*=\s*0\.\d)", re.I),
"spearman": re.compile(r"(spearman|秩相关|等级相关)", re.I),
"roc": re.compile(r"(roc曲线|auc|曲线下面积|受试者工作特征)", re.I),
"lsd": re.compile(r"(lsd检验|最小显著差异|事后lsd)", re.I),
"bonferroni": re.compile(r"(bonferroni|多重比较校正)", re.I),
}
def extract_full_text(file_path: Path) -> str:
"""提取 Word 文档全文"""
doc = Document(str(file_path))
paragraphs = [p.text for p in doc.paragraphs]
# 也提取表格中的文本
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
paragraphs.append(cell.text)
return "\n".join(paragraphs)
def detect_all_methods(text: str) -> dict:
"""使用扩展模式检测所有统计方法"""
found = {}
for method_name, pattern in EXTENDED_PATTERNS.items():
matches = pattern.findall(text)
if matches:
found[method_name] = list(set(matches)) # 去重
return found
def analyze_single_file(file_path: Path) -> dict:
"""分析单个文件"""
print(f"\n{'='*60}")
print(f"📄 {file_path.name[:50]}...")
print(f"{'='*60}")
# 提取全文
full_text = extract_full_text(file_path)
# 使用扩展模式检测(全面检测)
all_found = detect_all_methods(full_text)
# 使用系统模式检测(当前系统能力)
system_found = detect_methods(full_text)
print(f"\n📊 文档中使用的统计方法:")
for method, matches in sorted(all_found.items()):
info = ALL_KNOWN_METHODS.get(method, {})
category = info.get("category", "其他")
can_validate = info.get("can_validate", False)
# 检查系统是否能识别
in_system = method in system_found or method in ["paired-t", "logistic", "cox", "mann-whitney"]
status = "✅ 可验证" if can_validate else "⚠️ 仅识别"
detected = "🔍 已识别" if in_system else "❌ 未识别"
print(f" {method}: {matches[0][:30]}")
print(f" 类别: {category} | {detected} | {status}")
return {
"file": file_path.name,
"all_methods": list(all_found.keys()),
"system_detected": system_found,
"full_text_length": len(full_text),
}
def main():
"""主分析函数"""
print("=" * 70)
print("🔬 RVW V2.0 统计方法分析")
print("=" * 70)
# 获取所有测试文件
docx_files = list(TEST_DOCS_DIR.glob("*.docx"))
if not docx_files:
print(f"❌ 未找到测试文件")
return
print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
print(f"📄 找到 {len(docx_files)} 个测试文件\n")
# 分析每个文件
all_methods_found = set()
system_detected_all = set()
results = []
for file_path in docx_files:
try:
result = analyze_single_file(file_path)
results.append(result)
all_methods_found.update(result["all_methods"])
system_detected_all.update(result["system_detected"])
except Exception as e:
print(f"❌ 分析失败: {e}")
# 汇总报告
print("\n" + "=" * 70)
print("📊 汇总分析")
print("=" * 70)
print(f"\n📈 统计方法覆盖情况:")
print(f" 文档中共出现: {len(all_methods_found)} 种统计方法")
print(f" 系统可识别: {len(system_detected_all)}")
# 详细分类
print("\n" + "-" * 50)
print("📋 详细分类:")
print("-" * 50)
# 分类统计
can_detect_and_validate = []
can_detect_only = []
cannot_detect = []
for method in sorted(all_methods_found):
info = ALL_KNOWN_METHODS.get(method, {})
can_validate = info.get("can_validate", False)
# 检查系统是否能识别
in_system = method in METHOD_PATTERNS
if in_system and can_validate:
can_detect_and_validate.append(method)
elif in_system:
can_detect_only.append(method)
else:
cannot_detect.append(method)
print("\n✅ 【可识别 + 可验证】MVP Week 2 实现):")
for m in can_detect_and_validate:
info = ALL_KNOWN_METHODS.get(m, {})
print(f"{m}: {info.get('validation_note', '')}")
print("\n⚠️ 【可识别但无法验证】V2.1+ 实现):")
for m in can_detect_only:
info = ALL_KNOWN_METHODS.get(m, {})
print(f"{m}: {info.get('validation_note', '')}")
print("\n❌ 【无法识别】(需扩展正则):")
for m in cannot_detect:
info = ALL_KNOWN_METHODS.get(m, {})
print(f"{m}: {info.get('category', '其他')}")
# 验证能力矩阵
print("\n" + "-" * 50)
print("📋 验证能力矩阵:")
print("-" * 50)
print("\n| 方法 | 可识别 | 可验证 | 实现阶段 |")
print("|------|--------|--------|----------|")
for method in sorted(all_methods_found):
info = ALL_KNOWN_METHODS.get(method, {})
in_system = method in METHOD_PATTERNS
can_validate = info.get("can_validate", False)
detect_str = "" if in_system else ""
validate_str = "" if can_validate else ""
stage = "MVP" if can_validate else "V2.1+"
print(f"| {method} | {detect_str} | {validate_str} | {stage} |")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,48 @@
"""
RVW V2.0 数据侦探模块 (Data Forensics)
提供 Word 文档表格提取和数据验证功能:
- 表格精准提取python-docx
- L1 算术自洽性验证
- L2 统计学复核T检验、卡方检验
- HTML 片段生成(含 R1C1 坐标)
Author: AIclinicalresearch Team
Version: 2.0.0
Date: 2026-02-17
"""
from .types import (
ForensicsConfig,
TableData,
Issue,
ForensicsResult,
ExtractionError,
Severity,
IssueType,
CellLocation,
)
from .extractor import DocxTableExtractor
from .validator import ArithmeticValidator, StatValidator
from .api import router as forensics_router
__all__ = [
# 类型
"ForensicsConfig",
"TableData",
"Issue",
"ForensicsResult",
"ExtractionError",
"Severity",
"IssueType",
"CellLocation",
# 核心类
"DocxTableExtractor",
"ArithmeticValidator",
"StatValidator",
# 路由
"forensics_router",
]
__version__ = "2.0.0"

View File

@@ -0,0 +1,221 @@
"""
数据侦探模块 - FastAPI 路由
提供 /api/v1/forensics/* 接口
API 端点:
- GET /api/v1/forensics/health - 健康检查
- POST /api/v1/forensics/analyze_docx - 分析 Word 文档
- GET /api/v1/forensics/supported_formats - 获取支持的格式
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
from pathlib import Path
import os
import time
from .types import ForensicsConfig, ForensicsResult, Severity
from .config import (
validate_file_size,
validate_file_extension,
detect_methods,
MAX_FILE_SIZE_BYTES,
ALLOWED_EXTENSIONS,
)
from .extractor import DocxTableExtractor
from .validator import ArithmeticValidator, StatValidator
# 创建路由器
router = APIRouter(prefix="/api/v1/forensics", tags=["forensics"])
# 临时文件目录
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
@router.get("/health")
async def forensics_health():
"""
数据侦探模块健康检查
"""
try:
# 检查依赖
import docx
import pandas
import scipy
return {
"status": "healthy",
"module": "forensics",
"version": "2.0.0",
"dependencies": {
"python-docx": docx.__version__ if hasattr(docx, '__version__') else "unknown",
"pandas": pandas.__version__,
"scipy": scipy.__version__,
}
}
except ImportError as e:
return {
"status": "degraded",
"module": "forensics",
"error": f"Missing dependency: {e}"
}
@router.post("/analyze_docx")
async def analyze_docx(
file: UploadFile = File(...),
check_level: str = "L1_L2",
tolerance_percent: float = 0.1,
max_table_rows: int = 500
):
"""
分析 Word 文档表格数据
Args:
file: 上传的 .docx 文件
check_level: 验证级别 (L1 / L1_L2)
tolerance_percent: 百分比容错范围
max_table_rows: 单表最大行数
Returns:
ForensicsResult: 分析结果包含表格、HTML、问题列表
"""
temp_path = None
start_time = time.time()
try:
# 1. 验证文件扩展名
is_valid, error_msg = validate_file_extension(file.filename)
if not is_valid:
logger.warning(f"文件格式校验失败: {file.filename} - {error_msg}")
raise HTTPException(status_code=400, detail=error_msg)
# 2. 读取文件内容
content = await file.read()
file_size = len(content)
# 3. 验证文件大小
is_valid, error_msg = validate_file_size(file_size)
if not is_valid:
logger.warning(f"文件大小校验失败: {file.filename} - {error_msg}")
raise HTTPException(status_code=400, detail=error_msg)
logger.info(f"开始分析 Word 文档: {file.filename}, 大小: {file_size/1024:.1f}KB")
# 4. 保存临时文件
temp_path = TEMP_DIR / f"forensics_{os.getpid()}_{file.filename}"
with open(temp_path, "wb") as f:
f.write(content)
# 5. 创建配置
config = ForensicsConfig(
check_level=check_level,
tolerance_percent=tolerance_percent,
max_table_rows=max_table_rows
)
# 6. 提取表格
extractor = DocxTableExtractor(config)
tables, full_text = extractor.extract(str(temp_path))
# 7. 检测统计方法
methods_found = detect_methods(full_text)
logger.info(f"检测到统计方法: {methods_found}")
# 8. L1 算术验证
arithmetic_validator = ArithmeticValidator(config)
for table in tables:
if not table.skipped:
arithmetic_validator.validate(table)
# 9. L2 统计验证(如果启用)
if check_level == "L1_L2":
stat_validator = StatValidator(config)
for table in tables:
if not table.skipped:
stat_validator.validate(table, full_text)
# 10. 统计问题数量
total_issues = 0
error_count = 0
warning_count = 0
for table in tables:
for issue in table.issues:
total_issues += 1
if issue.severity == Severity.ERROR:
error_count += 1
elif issue.severity == Severity.WARNING:
warning_count += 1
execution_time_ms = int((time.time() - start_time) * 1000)
# 11. 构建结果
result = ForensicsResult(
success=True,
methods_found=methods_found,
tables=tables,
total_issues=total_issues,
error_count=error_count,
warning_count=warning_count,
execution_time_ms=execution_time_ms,
error=None,
fallback_available=True
)
logger.info(
f"分析完成: {file.filename}, "
f"表格: {len(tables)}, "
f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), "
f"耗时: {execution_time_ms}ms"
)
return JSONResponse(content=result.model_dump())
except HTTPException:
raise
except Exception as e:
logger.error(f"分析失败: {file.filename} - {str(e)}")
execution_time_ms = int((time.time() - start_time) * 1000)
# 返回失败结果(支持降级)
result = ForensicsResult(
success=False,
methods_found=[],
tables=[],
total_issues=0,
error_count=0,
warning_count=0,
execution_time_ms=execution_time_ms,
error=str(e),
fallback_available=True
)
return JSONResponse(
status_code=500,
content=result.model_dump()
)
finally:
# 清理临时文件
if temp_path and temp_path.exists():
try:
os.remove(temp_path)
except Exception as e:
logger.warning(f"清理临时文件失败: {e}")
@router.get("/supported_formats")
async def supported_formats():
"""
获取支持的文件格式
"""
return {
"formats": list(ALLOWED_EXTENSIONS),
"max_file_size_mb": MAX_FILE_SIZE_BYTES / 1024 / 1024,
"note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx"
}

View File

@@ -0,0 +1,182 @@
"""
数据侦探模块 - 配置和常量
包含文件限制、正则表达式、默认配置等。
"""
import re
from typing import Dict, Pattern
# ==================== 文件限制 ====================
MAX_FILE_SIZE_MB = 20 # 最大文件大小MB
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
MAX_TABLE_ROWS = 500 # 单表最大行数
MAX_TABLES_PER_DOC = 50 # 单文档最大表格数
ALLOWED_EXTENSIONS = {".docx"} # MVP 仅支持 .docx
# ==================== 正则表达式 ====================
# n (%) 格式匹配,如 "45 (50.0%)" 或 "45(50%)"
PERCENT_PATTERN = re.compile(
r"(\d+(?:\.\d+)?)\s*\(\s*(\d+(?:\.\d+)?)\s*%?\s*\)",
re.IGNORECASE
)
# P 值匹配,如 "P=0.05" 或 "p < 0.001" 或 "P值=0.05"
PVALUE_PATTERN = re.compile(
r"[Pp][\s\-值]*[=<>≤≥]\s*(\d+\.?\d*)",
re.IGNORECASE
)
# 置信区间匹配,如 "95% CI: 1.2-2.5" 或 "(1.2, 2.5)"
CI_PATTERN = re.compile(
r"(?:95%?\s*CI[:\s]*)?[\(\[]?\s*(\d+\.?\d*)\s*[-,]\s*(\d+\.?\d*)\s*[\)\]]?",
re.IGNORECASE
)
# OR/HR/RR 匹配
EFFECT_SIZE_PATTERN = re.compile(
r"(?:OR|HR|RR)\s*[=:]\s*(\d+\.?\d*)",
re.IGNORECASE
)
# ==================== 统计方法检测 ====================
METHOD_PATTERNS: Dict[str, Pattern] = {
"t-test": re.compile(
r"(t[\s\-]?test|t[\s\-]?检验|student.*test|independent.*sample|独立样本|两样本)",
re.IGNORECASE
),
"chi-square": re.compile(
r"(chi[\s\-]?square|χ2|χ²|卡方|pearson.*chi|fisher.*exact|fisher精确)",
re.IGNORECASE
),
"anova": re.compile(
r"(anova|analysis\s+of\s+variance|方差分析|单因素|多因素)",
re.IGNORECASE
),
"logistic": re.compile(
r"(logistic\s+regression|逻辑回归|二元回归|logit)",
re.IGNORECASE
),
"cox": re.compile(
r"(cox\s+regression|cox\s+proportional|生存分析|比例风险|kaplan[\s\-]?meier)",
re.IGNORECASE
),
"mann-whitney": re.compile(
r"(mann[\s\-]?whitney|wilcoxon|秩和检验|非参数)",
re.IGNORECASE
),
"paired-t": re.compile(
r"(paired[\s\-]?t|配对.*t|before[\s\-]?after)",
re.IGNORECASE
),
}
# ==================== 表格类型检测 ====================
# 基线特征表关键词
BASELINE_KEYWORDS = [
"baseline", "characteristics", "demographic", "基线", "特征", "人口学"
]
# 结局表关键词
OUTCOME_KEYWORDS = [
"outcome", "result", "efficacy", "endpoint", "结局", "疗效", "终点"
]
# ==================== 容错配置(终审建议) ====================
DEFAULT_TOLERANCE_PERCENT = 0.1 # 百分比容错 ±0.1%
# P 值容错阈值
PVALUE_ERROR_THRESHOLD = 0.05 # P 值差异 > 0.05 → Error严重矛盾
PVALUE_WARNING_THRESHOLD = 0.01 # P 值差异 > 0.01 → Warning可能舍入误差
PVALUE_RELATIVE_TOLERANCE = 0.05 # P 值相对误差 ±5%
# CI 容错阈值
CI_RELATIVE_TOLERANCE = 0.02 # CI 端点相对误差 ±2%
# 统计量容错
STAT_RELATIVE_TOLERANCE = 0.05 # t/χ² 值相对误差 ±5%
# ==================== Mean±SD 正则表达式 ====================
# Mean ± SD 格式,如 "45.2 ± 12.3" 或 "45.2±12.3" 或 "45.2 (12.3)"
MEAN_SD_PATTERN = re.compile(
r"(\d+\.?\d*)\s*[±\+\-]\s*(\d+\.?\d*)",
re.IGNORECASE
)
# 带括号的 SD 格式,如 "45.2 (12.3)" - 用于某些表格
MEAN_SD_PAREN_PATTERN = re.compile(
r"(\d+\.?\d*)\s*\(\s*(\d+\.?\d*)\s*\)(?!\s*%)", # 排除百分比格式
re.IGNORECASE
)
# CI 格式清洗器(终审建议:处理多种分隔符)
CI_PATTERNS = [
# 标准格式: 2.5 (1.1-3.5) 或 2.5 [1.1-3.5]
re.compile(r"[\(\[]\s*(\d+\.?\d*)\s*[-–—,;]\s*(\d+\.?\d*)\s*[\)\]]", re.IGNORECASE),
# 带 CI 标签: 95% CI: 1.1-3.5 或 95%CI 1.1 to 3.5
re.compile(r"95%?\s*CI\s*[:\s]+(\d+\.?\d*)\s*[-–—,;to]+\s*(\d+\.?\d*)", re.IGNORECASE),
# 简单范围: 1.1-3.5(需要上下文判断)
re.compile(r"(\d+\.?\d*)\s*[-–—]\s*(\d+\.?\d*)", re.IGNORECASE),
]
# ==================== 验证函数 ====================
def validate_file_size(size_bytes: int) -> tuple[bool, str]:
"""
验证文件大小
Returns:
(is_valid, error_message)
"""
if size_bytes > MAX_FILE_SIZE_BYTES:
return False, f"文件大小 ({size_bytes / 1024 / 1024:.1f}MB) 超过限制 ({MAX_FILE_SIZE_MB}MB)"
return True, ""
def validate_file_extension(filename: str) -> tuple[bool, str]:
"""
验证文件扩展名
Returns:
(is_valid, error_message)
"""
from pathlib import Path
ext = Path(filename).suffix.lower()
if ext not in ALLOWED_EXTENSIONS:
if ext == ".doc":
return False, "暂不支持 .doc 格式,请使用 Word 另存为 .docx 格式后重新上传"
return False, f"不支持的文件格式: {ext},仅支持 .docx"
return True, ""
def detect_methods(text: str) -> list[str]:
"""
检测文本中的统计方法(正则优先)
Args:
text: 文档全文
Returns:
检测到的方法列表
"""
found = []
for method_name, pattern in METHOD_PATTERNS.items():
if pattern.search(text):
found.append(method_name)
return found

View File

@@ -0,0 +1,340 @@
"""
数据侦探模块 - Word 表格提取器
使用 python-docx 解析 Word 文档,提取表格数据并生成 HTML 片段。
功能:
- 解析 Word DOM 结构
- 处理合并单元格Forward Fill 策略)
- 关联表格 Caption向前回溯
- 生成 HTML 片段(含 data-coord 属性)
"""
from docx import Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from loguru import logger
from typing import List, Optional, Tuple
import re
from .types import TableData, Issue, Severity, IssueType, CellLocation, ForensicsConfig
from .config import (
MAX_TABLE_ROWS,
MAX_TABLES_PER_DOC,
BASELINE_KEYWORDS,
OUTCOME_KEYWORDS,
)
class DocxTableExtractor:
"""
Word 表格提取器
提取 .docx 文件中的所有表格,处理合并单元格,生成 HTML 片段。
"""
def __init__(self, config: ForensicsConfig):
self.config = config
self.max_table_rows = config.max_table_rows
def extract(self, file_path: str) -> Tuple[List[TableData], str]:
"""
提取 Word 文档中的所有表格
Args:
file_path: .docx 文件路径
Returns:
(tables, full_text): 表格列表和全文文本
"""
logger.info(f"开始提取表格: {file_path}")
try:
doc = Document(file_path)
except Exception as e:
logger.error(f"无法打开 Word 文档: {e}")
raise ValueError(f"无法打开 Word 文档: {e}")
tables: List[TableData] = []
full_text_parts: List[str] = []
# 收集所有段落文本(用于方法检测)
for para in doc.paragraphs:
full_text_parts.append(para.text)
# 遍历文档元素,关联表格和 Caption
table_index = 0
prev_paragraphs: List[str] = []
for element in doc.element.body:
# 段落元素
if element.tag.endswith('p'):
para = Paragraph(element, doc)
prev_paragraphs.append(para.text.strip())
# 只保留最近 3 个段落用于 Caption 匹配
if len(prev_paragraphs) > 3:
prev_paragraphs.pop(0)
# 表格元素
elif element.tag.endswith('tbl'):
if table_index >= MAX_TABLES_PER_DOC:
logger.warning(f"表格数量超过限制 ({MAX_TABLES_PER_DOC}),跳过剩余表格")
break
# 获取 python-docx Table 对象
table = Table(element, doc)
# 提取 Caption
caption = self._find_caption(prev_paragraphs)
# 提取表格数据
table_data = self._extract_table(
table=table,
table_id=f"tbl_{table_index}",
caption=caption
)
tables.append(table_data)
table_index += 1
# 清空前置段落
prev_paragraphs = []
full_text = "\n".join(full_text_parts)
logger.info(f"提取完成: {len(tables)} 个表格, {len(full_text)} 字符")
return tables, full_text
def _find_caption(self, prev_paragraphs: List[str]) -> Optional[str]:
"""
从前置段落中查找表格 Caption
匹配模式:
- "Table 1. xxx""表 1 xxx"
- "Table 1: xxx"
"""
caption_pattern = re.compile(
r"^(Table|表)\s*\d+[\.:\s]",
re.IGNORECASE
)
# 从后向前查找
for para in reversed(prev_paragraphs):
if para and caption_pattern.match(para):
return para
return None
def _extract_table(
self,
table: Table,
table_id: str,
caption: Optional[str]
) -> TableData:
"""
提取单个表格数据
Args:
table: python-docx Table 对象
table_id: 表格 ID
caption: 表格标题
Returns:
TableData 对象
"""
rows = table.rows
row_count = len(rows)
col_count = len(rows[0].cells) if rows else 0
# 检查是否超过行数限制
if row_count > self.max_table_rows:
logger.warning(f"表格 {table_id} 行数 ({row_count}) 超过限制 ({self.max_table_rows}),跳过")
return TableData(
id=table_id,
caption=caption,
type=self._detect_table_type(caption),
row_count=row_count,
col_count=col_count,
html=f"<p class='warning'>表格行数 ({row_count}) 超过限制 ({self.max_table_rows}),已跳过</p>",
data=[],
issues=[
Issue(
severity=Severity.WARNING,
type=IssueType.TABLE_SKIPPED,
message=f"表格行数 ({row_count}) 超过限制 ({self.max_table_rows})",
location=CellLocation(table_id=table_id, row=1, col=1),
evidence={"row_count": row_count, "max_rows": self.max_table_rows}
)
],
skipped=True,
skip_reason=f"行数超限: {row_count} > {self.max_table_rows}"
)
# 提取原始数据(处理合并单元格)
data = self._extract_with_merge_handling(table)
# 生成 HTML
html = self._generate_html(table_id, caption, data)
# 检测表格类型
table_type = self._detect_table_type(caption)
return TableData(
id=table_id,
caption=caption,
type=table_type,
row_count=len(data),
col_count=len(data[0]) if data else 0,
html=html,
data=data,
issues=[],
skipped=False,
skip_reason=None
)
def _extract_with_merge_handling(self, table: Table) -> List[List[str]]:
"""
提取表格数据,处理合并单元格
使用 Forward Fill 策略:
- 水平合并:将值复制到所有合并的单元格
- 垂直合并:将上方单元格的值填充到下方
"""
rows = table.rows
if not rows:
return []
# 首先获取表格的真实维度
num_rows = len(rows)
num_cols = len(rows[0].cells)
# 初始化数据矩阵
data: List[List[str]] = [["" for _ in range(num_cols)] for _ in range(num_rows)]
# 记录每个单元格是否已被处理(用于处理合并单元格)
processed = [[False for _ in range(num_cols)] for _ in range(num_rows)]
for row_idx, row in enumerate(rows):
col_idx = 0
for cell in row.cells:
# 跳过已处理的单元格(合并单元格的一部分)
while col_idx < num_cols and processed[row_idx][col_idx]:
col_idx += 1
if col_idx >= num_cols:
break
# 获取单元格文本
cell_text = self._get_cell_text(cell)
# 检测合并范围
# python-docx 中合并单元格会重复出现同一个 cell 对象
# 我们通过比较 cell._tc 来检测
merge_width = 1
merge_height = 1
# 检测水平合并
for next_col in range(col_idx + 1, num_cols):
if next_col < len(row.cells):
next_cell = row.cells[next_col]
if next_cell._tc is cell._tc:
merge_width += 1
else:
break
# 填充数据
for r in range(row_idx, min(row_idx + merge_height, num_rows)):
for c in range(col_idx, min(col_idx + merge_width, num_cols)):
data[r][c] = cell_text
processed[r][c] = True
col_idx += merge_width
return data
def _get_cell_text(self, cell: _Cell) -> str:
"""
获取单元格文本(合并多个段落)
"""
paragraphs = cell.paragraphs
texts = [p.text.strip() for p in paragraphs]
return " ".join(texts).strip()
def _generate_html(
self,
table_id: str,
caption: Optional[str],
data: List[List[str]]
) -> str:
"""
生成 HTML 片段,包含 data-coord 属性用于前端高亮
"""
if not data:
return f"<table id='{table_id}' class='forensics-table'><tr><td>空表格</td></tr></table>"
html_parts = [f"<table id='{table_id}' class='forensics-table'>"]
# 添加 Caption
if caption:
html_parts.append(f" <caption>{self._escape_html(caption)}</caption>")
# 添加表头(假设第一行是表头)
html_parts.append(" <thead>")
html_parts.append(" <tr>")
for col_idx, cell in enumerate(data[0], start=1):
coord = f"R1C{col_idx}"
html_parts.append(
f' <th data-coord="{coord}">{self._escape_html(cell)}</th>'
)
html_parts.append(" </tr>")
html_parts.append(" </thead>")
# 添加表体
html_parts.append(" <tbody>")
for row_idx, row in enumerate(data[1:], start=2):
html_parts.append(" <tr>")
for col_idx, cell in enumerate(row, start=1):
coord = f"R{row_idx}C{col_idx}"
html_parts.append(
f' <td data-coord="{coord}">{self._escape_html(cell)}</td>'
)
html_parts.append(" </tr>")
html_parts.append(" </tbody>")
html_parts.append("</table>")
return "\n".join(html_parts)
def _escape_html(self, text: str) -> str:
"""转义 HTML 特殊字符"""
return (
text
.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#39;")
)
def _detect_table_type(self, caption: Optional[str]) -> str:
"""
检测表格类型
Returns:
BASELINE / OUTCOME / OTHER
"""
if not caption:
return "OTHER"
caption_lower = caption.lower()
for keyword in BASELINE_KEYWORDS:
if keyword in caption_lower:
return "BASELINE"
for keyword in OUTCOME_KEYWORDS:
if keyword in caption_lower:
return "OUTCOME"
return "OTHER"

View File

@@ -0,0 +1,114 @@
"""
数据侦探模块 - 类型定义
定义所有数据结构,确保类型安全和接口一致性。
"""
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional
from enum import Enum
class Severity(str, Enum):
"""问题严重程度"""
ERROR = "ERROR" # 严重错误,可能是数据造假
WARNING = "WARNING" # 警告,需要人工复核
INFO = "INFO" # 提示信息
class IssueType(str, Enum):
"""问题类型"""
# L1 算术错误
ARITHMETIC_PERCENT = "ARITHMETIC_PERCENT" # 百分比计算错误
ARITHMETIC_SUM = "ARITHMETIC_SUM" # 合计计算错误
ARITHMETIC_TOTAL = "ARITHMETIC_TOTAL" # Total 行错误
# L2 统计错误
STAT_TTEST_PVALUE = "STAT_TTEST_PVALUE" # T检验 P 值错误
STAT_CHI2_PVALUE = "STAT_CHI2_PVALUE" # 卡方检验 P 值错误
STAT_CI_PVALUE_CONFLICT = "STAT_CI_PVALUE_CONFLICT" # CI 与 P 值逻辑矛盾
# L2.5 一致性取证(终审提权)
STAT_SE_TRIANGLE = "STAT_SE_TRIANGLE" # SE 三角验证不一致
STAT_SD_GREATER_MEAN = "STAT_SD_GREATER_MEAN" # SD > Mean正值指标
STAT_REGRESSION_CI_P = "STAT_REGRESSION_CI_P" # 回归系数 CI↔P 不一致
# 提取问题
EXTRACTION_WARNING = "EXTRACTION_WARNING" # 提取警告
TABLE_SKIPPED = "TABLE_SKIPPED" # 表格被跳过(超限)
class ForensicsConfig(BaseModel):
"""数据侦探配置"""
check_level: str = Field(
default="L1_L2",
description="验证级别L1仅算术、L1_L2算术+基础统计)"
)
tolerance_percent: float = Field(
default=0.1,
description="百分比容错范围,默认 0.1%"
)
max_table_rows: int = Field(
default=500,
description="单表最大行数,超出跳过"
)
max_file_size_mb: int = Field(
default=20,
description="最大文件大小MB"
)
class CellLocation(BaseModel):
"""单元格位置R1C1 坐标)"""
table_id: str = Field(..., description="表格 ID如 tbl_0")
row: int = Field(..., description="行号,从 1 开始")
col: int = Field(..., description="列号,从 1 开始")
@property
def cell_ref(self) -> str:
"""返回 R1C1 格式的坐标"""
return f"R{self.row}C{self.col}"
class Issue(BaseModel):
"""发现的问题"""
severity: Severity = Field(..., description="严重程度")
type: IssueType = Field(..., description="问题类型")
message: str = Field(..., description="人类可读的问题描述")
location: Optional[CellLocation] = Field(None, description="问题位置")
evidence: Optional[Dict[str, Any]] = Field(None, description="证据数据")
class TableData(BaseModel):
"""提取的表格数据"""
id: str = Field(..., description="表格 ID如 tbl_0")
caption: Optional[str] = Field(None, description="表格标题")
type: Optional[str] = Field(None, description="表格类型BASELINE/OUTCOME/OTHER")
row_count: int = Field(..., description="行数")
col_count: int = Field(..., description="列数")
html: str = Field(..., description="预渲染的 HTML 片段")
data: List[List[str]] = Field(..., description="二维数组数据")
issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表")
skipped: bool = Field(default=False, description="是否被跳过(超限)")
skip_reason: Optional[str] = Field(None, description="跳过原因")
class ForensicsResult(BaseModel):
"""数据侦探分析结果"""
success: bool = Field(..., description="是否成功")
methods_found: List[str] = Field(default_factory=list, description="检测到的统计方法")
tables: List[TableData] = Field(default_factory=list, description="表格列表")
total_issues: int = Field(default=0, description="总问题数")
error_count: int = Field(default=0, description="ERROR 级别问题数")
warning_count: int = Field(default=0, description="WARNING 级别问题数")
execution_time_ms: int = Field(default=0, description="执行时间(毫秒)")
error: Optional[str] = Field(None, description="错误信息(如果失败)")
fallback_available: bool = Field(default=True, description="是否可降级执行")
class ExtractionError(Exception):
"""提取错误异常"""
def __init__(self, message: str, code: str = "EXTRACTION_FAILED"):
self.message = message
self.code = code
super().__init__(self.message)

View File

@@ -0,0 +1,839 @@
"""
数据侦探模块 - 验证器
包含 L1 算术验证、L2 统计验证、L2.5 一致性取证。
L1 算术验证:
- n (%) 格式验证
- Sum/Total 校验
- 容错逻辑
L2 统计验证:
- T 检验 P 值逆向验证
- 卡方检验 P 值逆向验证
- CI vs P 值逻辑检查
L2.5 一致性取证(终审提权):
- SE 三角验证(回归系数 CI↔P 一致性)
- SD > Mean 检查(正值指标启发式规则)
"""
import re
import math
from typing import List, Optional, Tuple
from loguru import logger
# scipy 用于统计计算
try:
from scipy import stats
SCIPY_AVAILABLE = True
except ImportError:
SCIPY_AVAILABLE = False
logger.warning("scipy 未安装L2 统计验证将受限")
from .types import (
TableData,
Issue,
Severity,
IssueType,
CellLocation,
ForensicsConfig,
)
from .config import (
PERCENT_PATTERN,
PVALUE_PATTERN,
CI_PATTERN,
MEAN_SD_PATTERN,
MEAN_SD_PAREN_PATTERN,
CI_PATTERNS,
EFFECT_SIZE_PATTERN,
DEFAULT_TOLERANCE_PERCENT,
PVALUE_ERROR_THRESHOLD,
PVALUE_WARNING_THRESHOLD,
STAT_RELATIVE_TOLERANCE,
)
class ArithmeticValidator:
"""
L1 算术自洽性验证器
验证表格中的数值计算是否正确:
- n (%) 格式中的百分比是否等于 n/N
- Total/Sum 行是否等于其他行之和
"""
def __init__(self, config: ForensicsConfig):
self.config = config
self.tolerance = config.tolerance_percent
def validate(self, table: TableData) -> List[Issue]:
"""
验证表格的算术一致性
Args:
table: 要验证的表格数据
Returns:
发现的问题列表
"""
if table.skipped or not table.data:
return []
issues: List[Issue] = []
# 1. 验证 n (%) 格式
percent_issues = self._validate_percent_format(table)
issues.extend(percent_issues)
# 2. 验证 Sum/Total 行
sum_issues = self._validate_sum_rows(table)
issues.extend(sum_issues)
# 更新表格的 issues
table.issues.extend(issues)
logger.debug(f"表格 {table.id} 算术验证完成: {len(issues)} 个问题")
return issues
def _validate_percent_format(self, table: TableData) -> List[Issue]:
"""
验证 n (%) 格式
查找形如 "45 (50.0%)" 的单元格,验证百分比是否正确。
需要从表头或同行找到总数 N。
"""
issues: List[Issue] = []
data = table.data
if len(data) < 2: # 至少需要表头和一行数据
return issues
# 尝试从表头识别 N 列(如 "n", "N", "Total", "合计"
header = data[0]
n_col_indices = self._find_n_columns(header)
for row_idx, row in enumerate(data[1:], start=2): # 从第2行开始数据行
for col_idx, cell in enumerate(row, start=1):
# 查找 n (%) 格式
match = PERCENT_PATTERN.search(cell)
if match:
n_value = float(match.group(1))
reported_percent = float(match.group(2))
# 尝试找到对应的 N 值
total_n = self._find_total_n(data, row_idx - 1, col_idx - 1, n_col_indices)
if total_n is not None and total_n > 0:
# 计算实际百分比
calculated_percent = (n_value / total_n) * 100
# 检查差异
diff = abs(calculated_percent - reported_percent)
if diff > self.tolerance:
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.ARITHMETIC_PERCENT,
message=f"百分比计算错误: 报告值 {reported_percent}%,计算值 {calculated_percent:.1f}% (n={n_value}, N={total_n})",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=col_idx
),
evidence={
"n": n_value,
"N": total_n,
"reported_percent": reported_percent,
"calculated_percent": round(calculated_percent, 2),
"difference": round(diff, 2)
}
))
return issues
def _find_n_columns(self, header: List[str]) -> List[int]:
"""
从表头识别可能包含 N 值的列索引
"""
n_keywords = ["n", "total", "合计", "总数", "all", "sum"]
indices = []
for idx, cell in enumerate(header):
cell_lower = cell.lower().strip()
for keyword in n_keywords:
if keyword in cell_lower:
indices.append(idx)
break
return indices
def _find_total_n(
self,
data: List[List[str]],
row_idx: int,
col_idx: int,
n_col_indices: List[int]
) -> Optional[float]:
"""
查找对应的总数 N
策略:
1. 首先检查同行的 N 列
2. 如果没有,检查表头行对应位置
3. 尝试解析同列第一个纯数字
"""
row = data[row_idx]
# 策略 1检查同行的 N 列
for n_col in n_col_indices:
if n_col < len(row):
n_val = self._parse_number(row[n_col])
if n_val is not None and n_val > 0:
return n_val
# 策略 2检查同列的第一行可能是 N 值)
if row_idx > 0:
first_data_row = data[1] if len(data) > 1 else None
if first_data_row and col_idx < len(first_data_row):
# 检查是否该列第一行就是数字Total N
n_val = self._parse_number(first_data_row[col_idx])
if n_val is not None and n_val > 0:
return n_val
# 策略 3尝试从同行其他单元格累加
# 这是一个启发式方法,可能不准确
return None
def _parse_number(self, text: str) -> Optional[float]:
"""
从文本中解析数字
处理:
- 纯数字 "45"
- 带逗号 "1,234"
- 带空格 "1 234"
"""
if not text:
return None
# 移除常见分隔符
cleaned = text.strip().replace(",", "").replace(" ", "")
# 尝试提取第一个数字
match = re.match(r"^(\d+(?:\.\d+)?)", cleaned)
if match:
try:
return float(match.group(1))
except ValueError:
return None
return None
def _validate_sum_rows(self, table: TableData) -> List[Issue]:
"""
验证 Sum/Total 行
查找标记为 "Total", "Sum", "合计" 的行,验证其值是否等于上方各行之和。
"""
issues: List[Issue] = []
data = table.data
if len(data) < 3: # 至少需要表头、数据行和合计行
return issues
# 查找 Total/Sum 行
total_keywords = ["total", "sum", "合计", "总计", "总和", "all"]
for row_idx, row in enumerate(data[1:], start=2): # 跳过表头
first_cell = row[0].lower().strip() if row else ""
is_total_row = any(kw in first_cell for kw in total_keywords)
if is_total_row:
# 验证每个数值列
for col_idx, cell in enumerate(row[1:], start=2): # 跳过第一列
total_val = self._parse_number(cell)
if total_val is None:
continue
# 计算上方各行的和
column_sum = 0.0
valid_sum = True
for prev_row_idx in range(1, row_idx - 1): # 从第一个数据行到当前行的上一行
if col_idx - 1 < len(data[prev_row_idx]):
prev_cell = data[prev_row_idx][col_idx - 1]
prev_val = self._parse_number(prev_cell)
if prev_val is not None:
column_sum += prev_val
else:
# 如果有非数字单元格,跳过验证
valid_sum = False
break
if valid_sum and column_sum > 0:
diff = abs(total_val - column_sum)
# 允许小数点误差
if diff > 0.5: # 容错 0.5
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.ARITHMETIC_SUM,
message=f"合计行计算错误: 报告值 {total_val},计算值 {column_sum}",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=col_idx
),
evidence={
"reported_total": total_val,
"calculated_sum": column_sum,
"difference": round(diff, 2)
}
))
return issues
class StatValidator:
"""
L2 统计学复核验证器 + L2.5 一致性取证
验证统计检验结果的合理性:
- T 检验 P 值逆向验证
- 卡方检验 P 值逆向验证(基于频数表)
- CI 与 P 值逻辑一致性检查
- SE 三角验证(回归系数 CI↔P 一致性)
- SD > Mean 检查(正值指标启发式规则)
"""
def __init__(self, config: ForensicsConfig):
self.config = config
def validate(self, table: TableData, full_text: str) -> List[Issue]:
"""
验证表格的统计学一致性
Args:
table: 要验证的表格数据
full_text: 文档全文(用于方法识别)
Returns:
发现的问题列表
"""
if table.skipped or not table.data:
return []
# 仅在 L1_L2 模式下执行
if self.config.check_level != "L1_L2":
return []
issues: List[Issue] = []
# 1. CI vs P 值逻辑检查(基础)
ci_issues = self._validate_ci_pvalue_consistency(table)
issues.extend(ci_issues)
# 2. T 检验逆向验证
if SCIPY_AVAILABLE:
ttest_issues = self._validate_ttest(table)
issues.extend(ttest_issues)
# 3. SE 三角验证(终审提权:回归系数 CI↔P 一致性)
se_issues = self._validate_se_triangle(table)
issues.extend(se_issues)
# 4. SD > Mean 检查(终审提权:启发式规则)
sd_issues = self._validate_sd_greater_mean(table)
issues.extend(sd_issues)
# 更新表格的 issues
table.issues.extend(issues)
logger.debug(f"表格 {table.id} 统计验证完成: {len(issues)} 个问题")
return issues
def _validate_ci_pvalue_consistency(self, table: TableData) -> List[Issue]:
"""
验证 CI 与 P 值的逻辑一致性
黄金法则:
- 若 95% CI 跨越 1.0(如 0.8-1.2)→ P 值必须 ≥ 0.05
- 若 95% CI 不跨越 1.0(如 1.1-1.5)→ P 值必须 < 0.05
违反此规则 = 数据逻辑矛盾
"""
issues: List[Issue] = []
data = table.data
for row_idx, row in enumerate(data[1:], start=2):
row_text = " ".join(row)
# 查找 CI使用增强的 CI 解析)
ci_result = self._parse_ci(row_text)
if ci_result is None:
continue
ci_lower, ci_upper = ci_result
# 查找 P 值
pvalue = self._parse_pvalue(row_text)
if pvalue is None:
continue
# 检查逻辑一致性
ci_crosses_one = ci_lower <= 1.0 <= ci_upper
p_significant = pvalue < 0.05
# 矛盾情况
if ci_crosses_one and p_significant:
# CI 跨越 1 但 P < 0.05,矛盾
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.STAT_CI_PVALUE_CONFLICT,
message=f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 跨越 1.0,但 P={pvalue} < 0.05",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1 # 整行问题
),
evidence={
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"ci_crosses_one": ci_crosses_one,
"pvalue": pvalue,
"p_significant": p_significant
}
))
elif not ci_crosses_one and not p_significant:
# CI 不跨越 1 但 P ≥ 0.05,矛盾
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.STAT_CI_PVALUE_CONFLICT,
message=f"CI 与 P 值逻辑矛盾: 95% CI ({ci_lower}-{ci_upper}) 不跨越 1.0,但 P={pvalue} ≥ 0.05",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1
),
evidence={
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"ci_crosses_one": ci_crosses_one,
"pvalue": pvalue,
"p_significant": p_significant
}
))
return issues
def _validate_ttest(self, table: TableData) -> List[Issue]:
"""
T 检验逆向验证
从表格中提取 M±SD, n 信息,反推 t 值和 P 值,
与报告的 P 值进行对比。
公式: t = (M1 - M2) / sqrt(SD1²/n1 + SD2²/n2)
"""
issues: List[Issue] = []
if not SCIPY_AVAILABLE:
return issues
data = table.data
if len(data) < 2:
return issues
# 查找包含组比较数据的行
for row_idx, row in enumerate(data[1:], start=2):
# 尝试提取同一行中的两组数据
mean_sd_matches = list(MEAN_SD_PATTERN.finditer(" ".join(row)))
if len(mean_sd_matches) >= 2:
# 找到至少两组 Mean±SD 数据
try:
m1, sd1 = float(mean_sd_matches[0].group(1)), float(mean_sd_matches[0].group(2))
m2, sd2 = float(mean_sd_matches[1].group(1)), float(mean_sd_matches[1].group(2))
# 提取 P 值
row_text = " ".join(row)
pvalue = self._parse_pvalue(row_text)
if pvalue is None:
continue
# 尝试从表头获取样本量(简化处理,假设 n=30
# 实际实现需要更复杂的表格解析
n1, n2 = self._estimate_sample_sizes(table, row_idx)
if n1 is None or n2 is None:
continue
# 计算 t 值
se = math.sqrt(sd1**2/n1 + sd2**2/n2)
if se == 0:
continue
t_calc = abs(m1 - m2) / se
df = n1 + n2 - 2
# 计算 P 值
p_calc = 2 * (1 - stats.t.cdf(t_calc, df))
# 比较 P 值
p_diff = abs(p_calc - pvalue)
if p_diff > PVALUE_ERROR_THRESHOLD:
# 严重矛盾
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.STAT_TTEST_PVALUE,
message=f"T 检验 P 值不一致: 报告 P={pvalue},计算 P={p_calc:.4f}(差异 {p_diff:.3f}",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1
),
evidence={
"group1": {"mean": m1, "sd": sd1, "n": n1},
"group2": {"mean": m2, "sd": sd2, "n": n2},
"t_calculated": round(t_calc, 3),
"df": df,
"p_calculated": round(p_calc, 4),
"p_reported": pvalue,
"p_difference": round(p_diff, 4)
}
))
elif p_diff > PVALUE_WARNING_THRESHOLD:
# 可能是舍入误差
issues.append(Issue(
severity=Severity.WARNING,
type=IssueType.STAT_TTEST_PVALUE,
message=f"T 检验 P 值轻微偏差: 报告 P={pvalue},计算 P={p_calc:.4f}(可能是舍入误差)",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1
),
evidence={
"p_calculated": round(p_calc, 4),
"p_reported": pvalue,
"p_difference": round(p_diff, 4)
}
))
except (ValueError, TypeError, ZeroDivisionError) as e:
logger.debug(f"T 检验验证失败: {e}")
continue
return issues
def _validate_se_triangle(self, table: TableData) -> List[Issue]:
"""
SE 三角验证(终审提权)
用于 Logistic 回归、Cox 回归等场景。
原理:
- SE = (ln(CI_upper) - ln(CI_lower)) / 3.92
- Z = ln(OR) / SE
- P_calculated = 2 * (1 - norm.cdf(|Z|))
若报告的 P 值与计算的 P 值严重不一致,则存在问题。
"""
issues: List[Issue] = []
data = table.data
if not SCIPY_AVAILABLE:
return issues
for row_idx, row in enumerate(data[1:], start=2):
row_text = " ".join(row)
# 查找 OR/HR/RR
effect_match = EFFECT_SIZE_PATTERN.search(row_text)
if not effect_match:
continue
try:
effect_size = float(effect_match.group(1))
if effect_size <= 0:
continue
except (ValueError, TypeError):
continue
# 查找 CI
ci_result = self._parse_ci(row_text)
if ci_result is None:
continue
ci_lower, ci_upper = ci_result
# 确保 CI 有效(正数且 lower < upper
if ci_lower <= 0 or ci_upper <= 0 or ci_lower >= ci_upper:
continue
# 查找报告的 P 值
pvalue = self._parse_pvalue(row_text)
if pvalue is None:
continue
try:
# SE 三角计算
ln_effect = math.log(effect_size)
ln_ci_lower = math.log(ci_lower)
ln_ci_upper = math.log(ci_upper)
# SE = (ln(CI_upper) - ln(CI_lower)) / 3.92 (for 95% CI)
se = (ln_ci_upper - ln_ci_lower) / 3.92
if se <= 0:
continue
# Z = ln(OR) / SE
z = abs(ln_effect) / se
# P = 2 * (1 - norm.cdf(|Z|))
p_calc = 2 * (1 - stats.norm.cdf(z))
# 比较 P 值
p_diff = abs(p_calc - pvalue)
if p_diff > PVALUE_ERROR_THRESHOLD:
# 严重矛盾
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.STAT_SE_TRIANGLE,
message=f"SE 三角验证不一致: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(差异 {p_diff:.3f}",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1
),
evidence={
"effect_size": effect_size,
"ci_lower": ci_lower,
"ci_upper": ci_upper,
"se_calculated": round(se, 4),
"z_calculated": round(z, 3),
"p_calculated": round(p_calc, 4),
"p_reported": pvalue,
"p_difference": round(p_diff, 4)
}
))
elif p_diff > PVALUE_WARNING_THRESHOLD:
# 轻微偏差,可能是舍入误差
issues.append(Issue(
severity=Severity.WARNING,
type=IssueType.STAT_SE_TRIANGLE,
message=f"SE 三角验证轻微偏差: 报告 P={pvalue},由 CI 反推 P={p_calc:.4f}(可能是舍入误差)",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=1
),
evidence={
"effect_size": effect_size,
"p_calculated": round(p_calc, 4),
"p_reported": pvalue,
"p_difference": round(p_diff, 4)
}
))
except (ValueError, ZeroDivisionError, TypeError) as e:
logger.debug(f"SE 三角验证失败: {e}")
continue
return issues
def _validate_sd_greater_mean(self, table: TableData) -> List[Issue]:
"""
SD > Mean 启发式检查(终审提权)
对于正值指标(如年龄、体重、血压、实验室指标),
SD > Mean 通常是不合理的,可能暗示数据问题。
例外情况:
- 差值指标(可正可负)
- 某些偏态分布指标
"""
issues: List[Issue] = []
data = table.data
# 识别表头,判断哪些列是正值指标
if len(data) < 2:
return issues
header = data[0]
# 正值指标的关键词(这些指标通常不应有 SD > Mean
positive_indicators = [
"age", "年龄", "weight", "体重", "bmi", "height", "身高",
"sbp", "dbp", "血压", "heart rate", "心率", "pulse", "脉搏",
"wbc", "rbc", "hgb", "plt", "白细胞", "红细胞", "血红蛋白", "血小板",
"creatinine", "肌酐", "bun", "尿素氮", "glucose", "血糖",
"alt", "ast", "转氨酶", "bilirubin", "胆红素",
"cost", "费用", "time", "时间", "duration", "持续"
]
for row_idx, row in enumerate(data[1:], start=2):
for col_idx, cell in enumerate(row, start=1):
# 检查 Mean±SD 格式
match = MEAN_SD_PATTERN.search(cell)
if not match:
# 尝试括号格式
match = MEAN_SD_PAREN_PATTERN.search(cell)
if not match:
continue
try:
mean_val = float(match.group(1))
sd_val = float(match.group(2))
except (ValueError, TypeError):
continue
# 检查 SD > Mean仅对 mean > 0 的情况)
if mean_val > 0 and sd_val > mean_val:
# 检查是否是正值指标(通过表头或行首判断)
context_text = ""
if col_idx - 1 < len(header):
context_text += header[col_idx - 1].lower()
if len(row) > 0:
context_text += " " + row[0].lower()
# 判断是否是已知的正值指标
is_positive_indicator = any(kw in context_text for kw in positive_indicators)
# 计算 CV变异系数
cv = sd_val / mean_val if mean_val != 0 else 0
if is_positive_indicator:
# 已知正值指标SD > Mean 是错误
issues.append(Issue(
severity=Severity.ERROR,
type=IssueType.STAT_SD_GREATER_MEAN,
message=f"SD 大于 Mean 异常: {mean_val}±{sd_val}CV={cv:.1%},该指标通常为正值",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=col_idx
),
evidence={
"mean": mean_val,
"sd": sd_val,
"cv": round(cv, 3),
"context": context_text[:50]
}
))
else:
# 未确定的指标,给出警告
issues.append(Issue(
severity=Severity.WARNING,
type=IssueType.STAT_SD_GREATER_MEAN,
message=f"SD 大于 Mean: {mean_val}±{sd_val}CV={cv:.1%},建议核查数据分布",
location=CellLocation(
table_id=table.id,
row=row_idx,
col=col_idx
),
evidence={
"mean": mean_val,
"sd": sd_val,
"cv": round(cv, 3)
}
))
return issues
# ==================== 辅助方法 ====================
def _parse_ci(self, text: str) -> Optional[Tuple[float, float]]:
"""
解析 CI 字符串,支持多种格式(终审建议)
支持格式:
- 2.5 (1.1-3.5)
- 2.5 (1.1, 3.5)
- 2.5 [1.1; 3.5]
- 95% CI: 1.1-3.5
- 95% CI 1.1 to 3.5
"""
for pattern in CI_PATTERNS:
match = pattern.search(text)
if match:
try:
lower = float(match.group(1))
upper = float(match.group(2))
if lower < upper: # 基本合理性检查
return lower, upper
except (ValueError, TypeError, IndexError):
continue
# 回退到原始的 CI_PATTERN
match = CI_PATTERN.search(text)
if match:
try:
lower = float(match.group(1))
upper = float(match.group(2))
if lower < upper:
return lower, upper
except (ValueError, TypeError):
pass
return None
def _parse_pvalue(self, text: str) -> Optional[float]:
"""
解析 P 值
处理:
- P=0.05
- P<0.001
- P>0.05
- p值=0.05
"""
match = PVALUE_PATTERN.search(text)
if match:
try:
return float(match.group(1))
except (ValueError, TypeError):
pass
return None
def _estimate_sample_sizes(
self,
table: TableData,
row_idx: int
) -> Tuple[Optional[int], Optional[int]]:
"""
尝试从表格中估计样本量
策略:
1. 查找表头中的 n 值
2. 查找 "(n=XX)" 格式
3. 默认返回 None
"""
data = table.data
header = data[0] if data else []
# 从表头查找 (n=XX) 格式
n_pattern = re.compile(r"\(?\s*n\s*[=:]\s*(\d+)\s*\)?", re.IGNORECASE)
n_values = []
for cell in header:
match = n_pattern.search(cell)
if match:
try:
n_values.append(int(match.group(1)))
except ValueError:
pass
if len(n_values) >= 2:
return n_values[0], n_values[1]
# 如果找不到,返回 None不进行验证
return None, None

View File

@@ -52,6 +52,9 @@ app.add_middleware(
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
# 注册 RVW V2.0 数据侦探路由
app.include_router(forensics_router)
# 导入服务模块
from services.pdf_extractor import extract_pdf_pymupdf
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
@@ -66,6 +69,9 @@ from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to
# 新增文档导出服务Markdown → Word
from services.doc_export_service import check_pandoc_available, convert_markdown_to_docx, create_protocol_docx
# 新增RVW V2.0 数据侦探模块
from forensics.api import router as forensics_router
# 兼容nougat 相关(已废弃,保留空实现避免报错)
def check_nougat_available(): return False
def get_nougat_info(): return {"available": False, "reason": "已废弃,使用 pymupdf4llm 替代"}

View File

@@ -12,6 +12,7 @@ python-multipart==0.0.6
pandas>=2.0.0
numpy>=1.24.0
polars>=0.19.0
scipy>=1.11.0 # 统计验证RVW V2.0 数据侦探T检验、卡方检验
# PDF处理 - 使用 pymupdf4llm替代 nougat更轻量
PyMuPDF>=1.24.0 # PDF 核心库(代码中 import fitz 使用)

View File

@@ -15,6 +15,9 @@ pypandoc>=1.13 # Markdown → Docx (需要系统安装 pandoc)
# Excel/CSV处理
pandas>=2.0.0 # 表格处理
openpyxl>=3.1.2 # Excel 读取
# 统计验证 (RVW V2.0 数据侦探)
scipy>=1.11.0 # T检验、卡方检验逆向计算
tabulate>=0.9.0 # DataFrame → Markdown
# PPT处理

View File

@@ -0,0 +1,245 @@
"""
Day 6 验证器测试脚本
测试内容:
1. T 检验逆向验证
2. SE 三角验证
3. SD > Mean 检查
4. CI vs P 值逻辑检查
"""
import sys
from pathlib import Path
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.types import ForensicsConfig, TableData, Severity
from forensics.validator import StatValidator, SCIPY_AVAILABLE
print("=" * 60)
print("Day 6 验证器测试")
print("=" * 60)
print(f"scipy 可用: {SCIPY_AVAILABLE}")
print()
def create_mock_table(table_id: str, data: list[list[str]], caption: str = "") -> TableData:
"""创建模拟表格数据"""
return TableData(
id=table_id,
caption=caption,
row_count=len(data),
col_count=len(data[0]) if data else 0,
html="<table></table>",
data=data,
issues=[],
skipped=False
)
def test_ci_pvalue_consistency():
"""测试 CI vs P 值逻辑一致性检查"""
print("=" * 40)
print("测试 1: CI vs P 值逻辑一致性")
print("=" * 40)
config = ForensicsConfig(check_level="L1_L2")
validator = StatValidator(config)
# 测试数据CI 跨越 1 但 P < 0.05(矛盾)
data_conflict1 = [
["Variable", "OR", "95% CI", "P value"],
["Age", "1.2", "(0.8-1.5)", "P=0.03"], # CI 跨越 1但 P < 0.05,矛盾
]
table1 = create_mock_table("test_ci_1", data_conflict1, "CI 矛盾测试 1")
issues1 = validator._validate_ci_pvalue_consistency(table1)
print(f" 测试数据: CI=0.8-1.5 (跨越1), P=0.03 (显著)")
print(f" 期望: 发现 ERROR")
print(f" 结果: {len(issues1)} 个问题")
if issues1:
print(f" - {issues1[0].severity.value}: {issues1[0].message}")
print()
# 测试数据CI 不跨越 1 且 P < 0.05(正确)
data_correct = [
["Variable", "OR", "95% CI", "P value"],
["Smoking", "2.5", "(1.2-4.8)", "P=0.01"], # CI 不跨越 1P < 0.05,正确
]
table2 = create_mock_table("test_ci_2", data_correct, "CI 正确测试")
issues2 = validator._validate_ci_pvalue_consistency(table2)
print(f" 测试数据: CI=1.2-4.8 (不跨越1), P=0.01 (显著)")
print(f" 期望: 无问题")
print(f" 结果: {len(issues2)} 个问题")
print()
return len(issues1) > 0 and len(issues2) == 0
def test_se_triangle():
"""测试 SE 三角验证"""
print("=" * 40)
print("测试 2: SE 三角验证 (OR/CI/P 一致性)")
print("=" * 40)
if not SCIPY_AVAILABLE:
print(" 跳过: scipy 不可用")
return True
config = ForensicsConfig(check_level="L1_L2")
validator = StatValidator(config)
# 测试数据OR=2.5, CI=1.5-4.2, P=0.001
# 根据 SE 三角公式验证
# SE = (ln(4.2) - ln(1.5)) / 3.92 = (1.435 - 0.405) / 3.92 = 0.263
# Z = ln(2.5) / 0.263 = 0.916 / 0.263 = 3.48
# P = 2 * (1 - norm.cdf(3.48)) ≈ 0.0005
data_consistent = [
["Variable", "OR (95% CI)", "P value"],
["Diabetes", "OR=2.5 (1.5-4.2)", "P=0.001"], # 应该一致
]
table1 = create_mock_table("test_se_1", data_consistent, "SE 三角一致性测试")
issues1 = validator._validate_se_triangle(table1)
print(f" 测试数据: OR=2.5, CI=1.5-4.2, P=0.001")
print(f" 结果: {len(issues1)} 个问题")
for issue in issues1:
print(f" - {issue.severity.value}: {issue.message}")
print()
# 测试数据OR=2.5, CI=1.5-4.2, P=0.5(明显矛盾)
data_conflict = [
["Variable", "OR (95% CI)", "P value"],
["Diabetes", "OR=2.5 (1.5-4.2)", "P=0.5"], # P 值严重矛盾
]
table2 = create_mock_table("test_se_2", data_conflict, "SE 三角矛盾测试")
issues2 = validator._validate_se_triangle(table2)
print(f" 测试数据: OR=2.5, CI=1.5-4.2, P=0.5 (矛盾)")
print(f" 期望: 发现 ERROR")
print(f" 结果: {len(issues2)} 个问题")
for issue in issues2:
print(f" - {issue.severity.value}: {issue.message}")
if issue.evidence:
print(f" 证据: P_calculated={issue.evidence.get('p_calculated')}, P_reported={issue.evidence.get('p_reported')}")
print()
return len(issues2) > 0
def test_sd_greater_mean():
"""测试 SD > Mean 检查"""
print("=" * 40)
print("测试 3: SD > Mean 启发式检查")
print("=" * 40)
config = ForensicsConfig(check_level="L1_L2")
validator = StatValidator(config)
# 测试数据:年龄 SD > Mean明显异常
data_abnormal = [
["Variable", "Group A", "Group B"],
["Age (years)", "25.0 ± 30.0", "28.0 ± 8.5"], # 第一个 SD > Mean
]
table1 = create_mock_table("test_sd_1", data_abnormal, "SD > Mean 异常测试")
issues1 = validator._validate_sd_greater_mean(table1)
print(f" 测试数据: 年龄 = 25.0 ± 30.0 (SD > Mean)")
print(f" 期望: 发现 ERROR (年龄是正值指标)")
print(f" 结果: {len(issues1)} 个问题")
for issue in issues1:
print(f" - {issue.severity.value}: {issue.message}")
print()
# 测试数据:正常情况
data_normal = [
["Variable", "Group A", "Group B"],
["Age (years)", "45.0 ± 12.0", "48.0 ± 10.5"], # 正常
]
table2 = create_mock_table("test_sd_2", data_normal, "SD 正常测试")
issues2 = validator._validate_sd_greater_mean(table2)
print(f" 测试数据: 年龄 = 45.0 ± 12.0 (正常)")
print(f" 期望: 无问题")
print(f" 结果: {len(issues2)} 个问题")
print()
return len(issues1) > 0 and len(issues2) == 0
def test_ttest_validation():
"""测试 T 检验逆向验证"""
print("=" * 40)
print("测试 4: T 检验逆向验证")
print("=" * 40)
if not SCIPY_AVAILABLE:
print(" 跳过: scipy 不可用")
return True
config = ForensicsConfig(check_level="L1_L2")
validator = StatValidator(config)
# 测试数据:包含样本量的表头
# 真实 t 检验M1=45, SD1=10, n1=50; M2=50, SD2=12, n2=48
# t = (50-45) / sqrt(10²/50 + 12²/48) = 5 / sqrt(2 + 3) = 5/2.24 = 2.23
# P ≈ 0.028
data_with_n = [
["Variable", "Group A (n=50)", "Group B (n=48)", "P value"],
["Score", "45.0 ± 10.0", "50.0 ± 12.0", "P=0.03"], # 接近正确
]
table1 = create_mock_table("test_t_1", data_with_n, "T 检验测试")
issues1 = validator._validate_ttest(table1)
print(f" 测试数据: Group A: 45.0±10.0 (n=50), Group B: 50.0±12.0 (n=48), P=0.03")
print(f" 结果: {len(issues1)} 个问题")
for issue in issues1:
print(f" - {issue.severity.value}: {issue.message}")
print()
return True
def run_all_tests():
"""运行所有测试"""
results = []
results.append(("CI vs P 值一致性", test_ci_pvalue_consistency()))
results.append(("SE 三角验证", test_se_triangle()))
results.append(("SD > Mean 检查", test_sd_greater_mean()))
results.append(("T 检验逆向验证", test_ttest_validation()))
print("=" * 60)
print("测试结果汇总")
print("=" * 60)
all_passed = True
for name, passed in results:
status = "✅ PASS" if passed else "❌ FAIL"
print(f" {name}: {status}")
if not passed:
all_passed = False
print()
if all_passed:
print("🎉 所有测试通过Day 6 验证器实现完成。")
else:
print("⚠️ 部分测试失败,请检查代码。")
return all_passed
if __name__ == "__main__":
success = run_all_tests()
sys.exit(0 if success else 1)

View File

@@ -0,0 +1,187 @@
"""
数据侦探模块测试脚本
测试 forensics 模块的表格提取和验证功能。
"""
import os
import sys
from pathlib import Path
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.types import ForensicsConfig
from forensics.extractor import DocxTableExtractor
from forensics.validator import ArithmeticValidator, StatValidator
from forensics.config import detect_methods
# 测试文件目录
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
def test_single_file(file_path: Path) -> dict:
"""测试单个文件"""
print(f"\n{'='*60}")
print(f"📄 测试文件: {file_path.name}")
print(f" 大小: {file_path.stat().st_size / 1024:.1f} KB")
print(f"{'='*60}")
# 创建配置
config = ForensicsConfig(
check_level="L1_L2",
tolerance_percent=0.1,
max_table_rows=500
)
# 提取表格
extractor = DocxTableExtractor(config)
try:
tables, full_text = extractor.extract(str(file_path))
except Exception as e:
print(f"❌ 提取失败: {e}")
return {"success": False, "error": str(e)}
print(f"\n📊 提取结果:")
print(f" - 表格数量: {len(tables)}")
print(f" - 全文长度: {len(full_text)} 字符")
# 检测统计方法
methods = detect_methods(full_text)
print(f" - 检测到的统计方法: {methods if methods else ''}")
# 显示表格信息
for table in tables:
print(f"\n 📋 表格 {table.id}:")
print(f" - Caption: {table.caption[:50] if table.caption else ''}...")
print(f" - 类型: {table.type}")
print(f" - 大小: {table.row_count}× {table.col_count}")
print(f" - 跳过: {table.skipped}")
# 显示前 3 行数据预览
if table.data and not table.skipped:
print(f" - 数据预览 (前 3 行):")
for i, row in enumerate(table.data[:3]):
row_preview = " | ".join([str(cell)[:15] for cell in row[:4]])
print(f" Row {i+1}: {row_preview}...")
# L1 算术验证
print(f"\n🔍 L1 算术验证:")
arithmetic_validator = ArithmeticValidator(config)
for table in tables:
if not table.skipped:
arithmetic_validator.validate(table)
# L2 统计验证
print(f"🔬 L2 统计验证:")
stat_validator = StatValidator(config)
for table in tables:
if not table.skipped:
stat_validator.validate(table, full_text)
# 统计问题
total_issues = 0
error_count = 0
warning_count = 0
for table in tables:
for issue in table.issues:
total_issues += 1
if issue.severity.value == "ERROR":
error_count += 1
elif issue.severity.value == "WARNING":
warning_count += 1
# 显示问题详情
print(f"\n ⚠️ [{issue.severity.value}] {issue.type.value}")
print(f" 位置: {issue.location.cell_ref if issue.location else 'N/A'}")
print(f" 描述: {issue.message}")
if issue.evidence:
print(f" 证据: {issue.evidence}")
print(f"\n📈 统计:")
print(f" - 总问题数: {total_issues}")
print(f" - ERROR: {error_count}")
print(f" - WARNING: {warning_count}")
# 显示 HTML 预览(第一个表格)
if tables and not tables[0].skipped:
html_preview = tables[0].html[:500] if len(tables[0].html) > 500 else tables[0].html
print(f"\n📝 HTML 预览 (表格 0):")
print(html_preview)
print("...")
return {
"success": True,
"file": file_path.name,
"tables": len(tables),
"methods": methods,
"total_issues": total_issues,
"error_count": error_count,
"warning_count": warning_count
}
def main():
"""主测试函数"""
print("=" * 70)
print("🔬 RVW V2.0 数据侦探模块测试")
print("=" * 70)
# 检查测试目录
if not TEST_DOCS_DIR.exists():
print(f"❌ 测试目录不存在: {TEST_DOCS_DIR}")
return
# 获取所有 .docx 文件
docx_files = list(TEST_DOCS_DIR.glob("*.docx"))
if not docx_files:
print(f"❌ 测试目录中没有 .docx 文件")
return
print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
print(f"📄 找到 {len(docx_files)} 个测试文件")
# 测试每个文件
results = []
for file_path in docx_files:
try:
result = test_single_file(file_path)
results.append(result)
except Exception as e:
print(f"\n❌ 测试 {file_path.name} 时出错: {e}")
import traceback
traceback.print_exc()
results.append({
"success": False,
"file": file_path.name,
"error": str(e)
})
# 汇总结果
print("\n" + "=" * 70)
print("📊 测试汇总")
print("=" * 70)
success_count = sum(1 for r in results if r.get("success"))
total_tables = sum(r.get("tables", 0) for r in results if r.get("success"))
total_issues = sum(r.get("total_issues", 0) for r in results if r.get("success"))
total_errors = sum(r.get("error_count", 0) for r in results if r.get("success"))
print(f"\n✅ 成功: {success_count}/{len(results)}")
print(f"📋 总表格数: {total_tables}")
print(f"⚠️ 总问题数: {total_issues} (ERROR: {total_errors})")
print("\n📝 详细结果:")
for r in results:
status = "" if r.get("success") else ""
print(f" {status} {r.get('file', 'Unknown')}")
if r.get("success"):
print(f" 表格: {r.get('tables', 0)}, 问题: {r.get('total_issues', 0)}, 方法: {r.get('methods', [])}")
else:
print(f" 错误: {r.get('error', 'Unknown')}")
if __name__ == "__main__":
main()