feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary: - Implement L2 Statistical Validator (CI-P consistency, T-test reverse) - Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check) - Add error/warning severity classification with tolerance thresholds - Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix) - Complete Python forensics service (types, config, validator, extractor) V2.0 Development Progress (Week 2 Day 6): - Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator - Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1) Test Results: - Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test) - Real document tests: 5/5 successful, 2 reasonable WARNINGs Status: Day 6 completed, ready for Day 7 (Skills Framework) Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
221
extraction_service/forensics/api.py
Normal file
221
extraction_service/forensics/api.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""
|
||||
数据侦探模块 - FastAPI 路由
|
||||
|
||||
提供 /api/v1/forensics/* 接口
|
||||
|
||||
API 端点:
|
||||
- GET /api/v1/forensics/health - 健康检查
|
||||
- POST /api/v1/forensics/analyze_docx - 分析 Word 文档
|
||||
- GET /api/v1/forensics/supported_formats - 获取支持的格式
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
import os
|
||||
import time
|
||||
|
||||
from .types import ForensicsConfig, ForensicsResult, Severity
|
||||
from .config import (
|
||||
validate_file_size,
|
||||
validate_file_extension,
|
||||
detect_methods,
|
||||
MAX_FILE_SIZE_BYTES,
|
||||
ALLOWED_EXTENSIONS,
|
||||
)
|
||||
from .extractor import DocxTableExtractor
|
||||
from .validator import ArithmeticValidator, StatValidator
|
||||
|
||||
# Scratch directory for uploaded documents; created eagerly so request
# handlers can assume it already exists.
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)

# Router carrying every forensics endpoint under a common prefix.
router = APIRouter(prefix="/api/v1/forensics", tags=["forensics"])
|
||||
|
||||
|
||||
@router.get("/health")
async def forensics_health():
    """Health check for the forensics module.

    Probes the third-party parsing/statistics dependencies and reports
    their versions. A missing dependency yields a "degraded" payload
    rather than an error, so the service itself stays reachable.
    """
    try:
        import docx
        import pandas
        import scipy
    except ImportError as e:
        # Report the missing dependency instead of raising: forensics is
        # unavailable but the rest of the service keeps working.
        return {
            "status": "degraded",
            "module": "forensics",
            "error": f"Missing dependency: {e}"
        }

    # python-docx historically did not always expose __version__.
    docx_version = getattr(docx, '__version__', "unknown")
    return {
        "status": "healthy",
        "module": "forensics",
        "version": "2.0.0",
        "dependencies": {
            "python-docx": docx_version,
            "pandas": pandas.__version__,
            "scipy": scipy.__version__,
        }
    }
|
||||
|
||||
|
||||
@router.post("/analyze_docx")
async def analyze_docx(
    file: UploadFile = File(...),
    check_level: str = "L1_L2",
    tolerance_percent: float = 0.1,
    max_table_rows: int = 500
):
    """Analyze the tables of an uploaded Word document.

    Runs the L1 arithmetic validator on every extracted table and, when
    ``check_level`` is ``"L1_L2"``, the L2 statistical validator as well.

    Args:
        file: Uploaded .docx file.
        check_level: Validation level (``"L1"`` or ``"L1_L2"``).
        tolerance_percent: Percentage tolerance used by the validators.
        max_table_rows: Maximum number of rows processed per table.

    Returns:
        JSONResponse wrapping a serialized ForensicsResult (tables, issue
        counts, timing). Unexpected errors produce a 500 response with
        ``success=False`` and ``fallback_available=True`` so the caller
        can degrade gracefully.

    Raises:
        HTTPException: 400 when the extension or file size is rejected.
    """
    temp_path = None
    start_time = time.time()

    try:
        # 1. Validate the file extension.
        is_valid, error_msg = validate_file_extension(file.filename)
        if not is_valid:
            logger.warning(f"文件格式校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        # 2. Read the upload into memory.
        content = await file.read()
        file_size = len(content)

        # 3. Validate the file size.
        is_valid, error_msg = validate_file_size(file_size)
        if not is_valid:
            logger.warning(f"文件大小校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        logger.info(f"开始分析 Word 文档: {file.filename}, 大小: {file_size/1024:.1f}KB")

        # 4. Persist to a temporary file for the extractor.
        # The client-supplied filename is untrusted: keep only its basename
        # so a name containing path separators cannot escape TEMP_DIR, and
        # add pid + nanosecond timestamp so concurrent requests in the same
        # worker cannot collide on the same temp path.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"forensics_{os.getpid()}_{time.time_ns()}_{safe_name}"
        with open(temp_path, "wb") as f:
            f.write(content)

        # 5. Build the validation configuration.
        config = ForensicsConfig(
            check_level=check_level,
            tolerance_percent=tolerance_percent,
            max_table_rows=max_table_rows
        )

        # 6. Extract tables and the document's full text.
        extractor = DocxTableExtractor(config)
        tables, full_text = extractor.extract(str(temp_path))

        # 7. Detect which statistical methods the document mentions.
        methods_found = detect_methods(full_text)
        logger.info(f"检测到统计方法: {methods_found}")

        # 8. L1 arithmetic validation on non-skipped tables.
        arithmetic_validator = ArithmeticValidator(config)
        for table in tables:
            if not table.skipped:
                arithmetic_validator.validate(table)

        # 9. L2 statistical validation (only when requested).
        if check_level == "L1_L2":
            stat_validator = StatValidator(config)
            for table in tables:
                if not table.skipped:
                    stat_validator.validate(table, full_text)

        # 10. Tally issues by severity across all tables.
        all_issues = [issue for table in tables for issue in table.issues]
        total_issues = len(all_issues)
        error_count = sum(1 for i in all_issues if i.severity == Severity.ERROR)
        warning_count = sum(1 for i in all_issues if i.severity == Severity.WARNING)

        execution_time_ms = int((time.time() - start_time) * 1000)

        # 11. Build the result payload.
        result = ForensicsResult(
            success=True,
            methods_found=methods_found,
            tables=tables,
            total_issues=total_issues,
            error_count=error_count,
            warning_count=warning_count,
            execution_time_ms=execution_time_ms,
            error=None,
            fallback_available=True
        )

        logger.info(
            f"分析完成: {file.filename}, "
            f"表格: {len(tables)}, "
            f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), "
            f"耗时: {execution_time_ms}ms"
        )

        return JSONResponse(content=result.model_dump())

    except HTTPException:
        # 400s raised by the validation steps pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"分析失败: {file.filename} - {str(e)}")

        execution_time_ms = int((time.time() - start_time) * 1000)

        # Return a structured failure instead of a bare 500 so the caller
        # can fall back to its non-forensics pipeline.
        result = ForensicsResult(
            success=False,
            methods_found=[],
            tables=[],
            total_issues=0,
            error_count=0,
            warning_count=0,
            execution_time_ms=execution_time_ms,
            error=str(e),
            fallback_available=True
        )

        return JSONResponse(
            status_code=500,
            content=result.model_dump()
        )

    finally:
        # Best-effort cleanup of the temp file; never mask the response.
        if temp_path and temp_path.exists():
            try:
                os.remove(temp_path)
            except Exception as e:
                logger.warning(f"清理临时文件失败: {e}")
|
||||
|
||||
|
||||
@router.get("/supported_formats")
async def supported_formats():
    """Report the accepted upload formats and the size limit in MB."""
    max_mb = MAX_FILE_SIZE_BYTES / 1024 / 1024
    return {
        "formats": list(ALLOWED_EXTENSIONS),
        "max_file_size_mb": max_mb,
        "note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx"
    }
|
||||
Reference in New Issue
Block a user