feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary: - Implement L2 Statistical Validator (CI-P consistency, T-test reverse) - Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check) - Add error/warning severity classification with tolerance thresholds - Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix) - Complete Python forensics service (types, config, validator, extractor) V2.0 Development Progress (Week 2 Day 6): - Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator - Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1) Test Results: - Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test) - Real document tests: 5/5 successful, 2 reasonable WARNINGs Status: Day 6 completed, ready for Day 7 (Skills Framework) Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
221
extraction_service/forensics/api.py
Normal file
221
extraction_service/forensics/api.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""
|
||||
数据侦探模块 - FastAPI 路由
|
||||
|
||||
提供 /api/v1/forensics/* 接口
|
||||
|
||||
API 端点:
|
||||
- GET /api/v1/forensics/health - 健康检查
|
||||
- POST /api/v1/forensics/analyze_docx - 分析 Word 文档
|
||||
- GET /api/v1/forensics/supported_formats - 获取支持的格式
|
||||
"""
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
import os
|
||||
import time
|
||||
|
||||
from .types import ForensicsConfig, ForensicsResult, Severity
|
||||
from .config import (
|
||||
validate_file_size,
|
||||
validate_file_extension,
|
||||
detect_methods,
|
||||
MAX_FILE_SIZE_BYTES,
|
||||
ALLOWED_EXTENSIONS,
|
||||
)
|
||||
from .extractor import DocxTableExtractor
|
||||
from .validator import ArithmeticValidator, StatValidator
|
||||
|
||||
# Scratch directory for uploaded documents; created eagerly so request
# handlers can assume it already exists.
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)

# Router carrying every forensics endpoint under a common prefix.
router = APIRouter(prefix="/api/v1/forensics", tags=["forensics"])
|
||||
|
||||
|
||||
@router.get("/health")
async def forensics_health():
    """Health check for the forensics module.

    Probes the third-party parsing/statistics dependencies and reports
    their versions. A missing dependency yields a "degraded" payload
    rather than an error, so the service itself stays reachable.
    """
    try:
        import docx
        import pandas
        import scipy
    except ImportError as e:
        # Report the missing dependency instead of raising: forensics is
        # unavailable but the rest of the service keeps working.
        return {
            "status": "degraded",
            "module": "forensics",
            "error": f"Missing dependency: {e}"
        }

    # python-docx historically did not always expose __version__.
    docx_version = getattr(docx, '__version__', "unknown")
    return {
        "status": "healthy",
        "module": "forensics",
        "version": "2.0.0",
        "dependencies": {
            "python-docx": docx_version,
            "pandas": pandas.__version__,
            "scipy": scipy.__version__,
        }
    }
|
||||
|
||||
|
||||
@router.post("/analyze_docx")
async def analyze_docx(
    file: UploadFile = File(...),
    check_level: str = "L1_L2",
    tolerance_percent: float = 0.1,
    max_table_rows: int = 500
):
    """Analyze the tables of an uploaded Word document.

    Runs the L1 arithmetic validator on every extracted table and, when
    ``check_level`` is ``"L1_L2"``, the L2 statistical validator as well.

    Args:
        file: Uploaded .docx file.
        check_level: Validation level (``"L1"`` or ``"L1_L2"``).
        tolerance_percent: Percentage tolerance used by the validators.
        max_table_rows: Maximum number of rows processed per table.

    Returns:
        JSONResponse wrapping a serialized ForensicsResult (tables, issue
        counts, timing). Unexpected errors produce a 500 response with
        ``success=False`` and ``fallback_available=True`` so the caller
        can degrade gracefully.

    Raises:
        HTTPException: 400 when the extension or file size is rejected.
    """
    temp_path = None
    start_time = time.time()

    try:
        # 1. Validate the file extension.
        is_valid, error_msg = validate_file_extension(file.filename)
        if not is_valid:
            logger.warning(f"文件格式校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        # 2. Read the upload into memory.
        content = await file.read()
        file_size = len(content)

        # 3. Validate the file size.
        is_valid, error_msg = validate_file_size(file_size)
        if not is_valid:
            logger.warning(f"文件大小校验失败: {file.filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        logger.info(f"开始分析 Word 文档: {file.filename}, 大小: {file_size/1024:.1f}KB")

        # 4. Persist to a temporary file for the extractor.
        # The client-supplied filename is untrusted: keep only its basename
        # so a name containing path separators cannot escape TEMP_DIR, and
        # add pid + nanosecond timestamp so concurrent requests in the same
        # worker cannot collide on the same temp path.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"forensics_{os.getpid()}_{time.time_ns()}_{safe_name}"
        with open(temp_path, "wb") as f:
            f.write(content)

        # 5. Build the validation configuration.
        config = ForensicsConfig(
            check_level=check_level,
            tolerance_percent=tolerance_percent,
            max_table_rows=max_table_rows
        )

        # 6. Extract tables and the document's full text.
        extractor = DocxTableExtractor(config)
        tables, full_text = extractor.extract(str(temp_path))

        # 7. Detect which statistical methods the document mentions.
        methods_found = detect_methods(full_text)
        logger.info(f"检测到统计方法: {methods_found}")

        # 8. L1 arithmetic validation on non-skipped tables.
        arithmetic_validator = ArithmeticValidator(config)
        for table in tables:
            if not table.skipped:
                arithmetic_validator.validate(table)

        # 9. L2 statistical validation (only when requested).
        if check_level == "L1_L2":
            stat_validator = StatValidator(config)
            for table in tables:
                if not table.skipped:
                    stat_validator.validate(table, full_text)

        # 10. Tally issues by severity across all tables.
        all_issues = [issue for table in tables for issue in table.issues]
        total_issues = len(all_issues)
        error_count = sum(1 for i in all_issues if i.severity == Severity.ERROR)
        warning_count = sum(1 for i in all_issues if i.severity == Severity.WARNING)

        execution_time_ms = int((time.time() - start_time) * 1000)

        # 11. Build the result payload.
        result = ForensicsResult(
            success=True,
            methods_found=methods_found,
            tables=tables,
            total_issues=total_issues,
            error_count=error_count,
            warning_count=warning_count,
            execution_time_ms=execution_time_ms,
            error=None,
            fallback_available=True
        )

        logger.info(
            f"分析完成: {file.filename}, "
            f"表格: {len(tables)}, "
            f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), "
            f"耗时: {execution_time_ms}ms"
        )

        return JSONResponse(content=result.model_dump())

    except HTTPException:
        # 400s raised by the validation steps pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"分析失败: {file.filename} - {str(e)}")

        execution_time_ms = int((time.time() - start_time) * 1000)

        # Return a structured failure instead of a bare 500 so the caller
        # can fall back to its non-forensics pipeline.
        result = ForensicsResult(
            success=False,
            methods_found=[],
            tables=[],
            total_issues=0,
            error_count=0,
            warning_count=0,
            execution_time_ms=execution_time_ms,
            error=str(e),
            fallback_available=True
        )

        return JSONResponse(
            status_code=500,
            content=result.model_dump()
        )

    finally:
        # Best-effort cleanup of the temp file; never mask the response.
        if temp_path and temp_path.exists():
            try:
                os.remove(temp_path)
            except Exception as e:
                logger.warning(f"清理临时文件失败: {e}")
|
||||
|
||||
|
||||
@router.get("/supported_formats")
async def supported_formats():
    """Report the accepted upload formats and the size limit in MB."""
    max_mb = MAX_FILE_SIZE_BYTES / 1024 / 1024
    return {
        "formats": list(ALLOWED_EXTENSIONS),
        "max_file_size_mb": max_mb,
        "note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx"
    }
|
||||
Reference in New Issue
Block a user