""" 数据侦探模块 - FastAPI 路由 提供 /api/v1/forensics/* 接口 API 端点: - GET /api/v1/forensics/health - 健康检查 - POST /api/v1/forensics/analyze_docx - 分析 Word 文档 - GET /api/v1/forensics/supported_formats - 获取支持的格式 """ from fastapi import APIRouter, File, UploadFile, HTTPException from fastapi.responses import JSONResponse from loguru import logger from pathlib import Path import os import time from .types import ForensicsConfig, ForensicsResult, Severity from .config import ( validate_file_size, validate_file_extension, detect_methods, MAX_FILE_SIZE_BYTES, ALLOWED_EXTENSIONS, ) from .extractor import DocxTableExtractor from .validator import ArithmeticValidator, StatValidator # 创建路由器 router = APIRouter(prefix="/api/v1/forensics", tags=["forensics"]) # 临时文件目录 TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service")) TEMP_DIR.mkdir(parents=True, exist_ok=True) @router.get("/health") async def forensics_health(): """ 数据侦探模块健康检查 """ try: # 检查依赖 import docx import pandas import scipy return { "status": "healthy", "module": "forensics", "version": "2.0.0", "dependencies": { "python-docx": docx.__version__ if hasattr(docx, '__version__') else "unknown", "pandas": pandas.__version__, "scipy": scipy.__version__, } } except ImportError as e: return { "status": "degraded", "module": "forensics", "error": f"Missing dependency: {e}" } @router.post("/analyze_docx") async def analyze_docx( file: UploadFile = File(...), check_level: str = "L1_L2", tolerance_percent: float = 0.1, max_table_rows: int = 500 ): """ 分析 Word 文档表格数据 Args: file: 上传的 .docx 文件 check_level: 验证级别 (L1 / L1_L2) tolerance_percent: 百分比容错范围 max_table_rows: 单表最大行数 Returns: ForensicsResult: 分析结果,包含表格、HTML、问题列表 """ temp_path = None start_time = time.time() try: # 1. 验证文件扩展名 is_valid, error_msg = validate_file_extension(file.filename) if not is_valid: logger.warning(f"文件格式校验失败: {file.filename} - {error_msg}") raise HTTPException(status_code=400, detail=error_msg) # 2. 读取文件内容 content = await file.read() file_size = len(content) # 3. 验证文件大小 is_valid, error_msg = validate_file_size(file_size) if not is_valid: logger.warning(f"文件大小校验失败: {file.filename} - {error_msg}") raise HTTPException(status_code=400, detail=error_msg) logger.info(f"开始分析 Word 文档: {file.filename}, 大小: {file_size/1024:.1f}KB") # 4. 保存临时文件 temp_path = TEMP_DIR / f"forensics_{os.getpid()}_{file.filename}" with open(temp_path, "wb") as f: f.write(content) # 5. 创建配置 config = ForensicsConfig( check_level=check_level, tolerance_percent=tolerance_percent, max_table_rows=max_table_rows ) # 6. 提取表格 extractor = DocxTableExtractor(config) tables, full_text = extractor.extract(str(temp_path)) # 7. 检测统计方法 methods_found = detect_methods(full_text) logger.info(f"检测到统计方法: {methods_found}") # 8. L1 算术验证 arithmetic_validator = ArithmeticValidator(config) for table in tables: if not table.skipped: arithmetic_validator.validate(table) # 9. L2 统计验证(如果启用) if check_level == "L1_L2": stat_validator = StatValidator(config) for table in tables: if not table.skipped: stat_validator.validate(table, full_text) # 10. 统计问题数量 total_issues = 0 error_count = 0 warning_count = 0 for table in tables: for issue in table.issues: total_issues += 1 if issue.severity == Severity.ERROR: error_count += 1 elif issue.severity == Severity.WARNING: warning_count += 1 execution_time_ms = int((time.time() - start_time) * 1000) # 11. 构建结果 result = ForensicsResult( success=True, methods_found=methods_found, tables=tables, total_issues=total_issues, error_count=error_count, warning_count=warning_count, execution_time_ms=execution_time_ms, error=None, fallback_available=True ) logger.info( f"分析完成: {file.filename}, " f"表格: {len(tables)}, " f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), " f"耗时: {execution_time_ms}ms" ) return JSONResponse(content=result.model_dump(by_alias=True)) except HTTPException: raise except Exception as e: logger.error(f"分析失败: {file.filename} - {str(e)}") execution_time_ms = int((time.time() - start_time) * 1000) # 返回失败结果(支持降级) result = ForensicsResult( success=False, methods_found=[], tables=[], total_issues=0, error_count=0, warning_count=0, execution_time_ms=execution_time_ms, error=str(e), fallback_available=True ) return JSONResponse( status_code=500, content=result.model_dump() ) finally: # 清理临时文件 if temp_path and temp_path.exists(): try: os.remove(temp_path) except Exception as e: logger.warning(f"清理临时文件失败: {e}") @router.get("/supported_formats") async def supported_formats(): """ 获取支持的文件格式 """ return { "formats": list(ALLOWED_EXTENSIONS), "max_file_size_mb": MAX_FILE_SIZE_BYTES / 1024 / 1024, "note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx" }