Files
AIclinicalresearch/extraction_service/forensics/api.py
HaHafeng ba464082cb feat(core): finalize rvw stability updates and pending module changes
Summary:
- Harden RVW prompt protocol handling and methodology review flow with 20-checkpoint coverage, divide-and-conquer execution, and timeout tuning
- Update RVW frontend methodology report rendering to show real structured outputs and grouped checkpoint sections
- Include pending backend/frontend updates across IIT admin, SSA, extraction forensics, and related integration files
- Sync system and RVW status documentation, deployment checklist, and RVW architecture/plan docs

Validation:
- Verified lint diagnostics for touched RVW backend/frontend files show no new errors
- Kept backup dump files and local test artifacts untracked

Made-with: Cursor
2026-03-14 00:00:04 +08:00

225 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据侦探模块 - FastAPI 路由
提供 /api/v1/forensics/* 接口
API 端点:
- GET /api/v1/forensics/health - 健康检查
- POST /api/v1/forensics/analyze_docx - 分析 Word 文档
- GET /api/v1/forensics/supported_formats - 获取支持的格式
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from loguru import logger
from pathlib import Path
import os
import time
from .types import ForensicsConfig, ForensicsResult, Severity
from .config import (
validate_file_size,
validate_file_extension,
detect_methods,
MAX_FILE_SIZE_BYTES,
ALLOWED_EXTENSIONS,
)
from .extractor import DocxTableExtractor
from .validator import ArithmeticValidator, StatValidator
# Router that exposes every forensics endpoint under /api/v1/forensics.
router = APIRouter(tags=["forensics"], prefix="/api/v1/forensics")

# Scratch directory for uploaded files; overridable through the TEMP_DIR
# environment variable and created at import time if it does not exist yet.
TEMP_DIR = Path(os.environ.get("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(exist_ok=True, parents=True)
@router.get("/health")
async def forensics_health():
    """
    Health check for the forensics module.

    Imports the libraries the module relies on and reports their versions.

    Returns:
        A dict with status "healthy" plus the dependency versions, or a dict
        with status "degraded" and an error message when an import fails.
    """
    try:
        # Imported here so a missing package degrades this endpoint's answer
        # instead of surfacing only at module import time.
        import docx
        import pandas
        import scipy

        dependency_versions = {
            # Fall back to "unknown" when python-docx exposes no __version__.
            "python-docx": getattr(docx, '__version__', "unknown"),
            "pandas": pandas.__version__,
            "scipy": scipy.__version__,
        }
        report = {
            "status": "healthy",
            "module": "forensics",
            "version": "2.0.0",
            "dependencies": dependency_versions,
        }
        return report
    except ImportError as missing:
        degraded = {
            "status": "degraded",
            "module": "forensics",
            "error": f"Missing dependency: {missing}",
        }
        return degraded
def _make_temp_path(filename):
    """Return a unique scratch path under TEMP_DIR for one uploaded file.

    The client-supplied name is never embedded in the path: only its
    extension is kept, and only when it is purely alphanumeric. This keeps
    path separators and ``..`` segments sent by a client out of the
    filesystem path, and the random token stops two concurrent uploads with
    the same name from overwriting or deleting each other's file.
    """
    import uuid  # function-scope import; nothing else in this module needs it

    suffix = Path(filename or "").suffix
    if not suffix[1:].isalnum():
        suffix = ""
    return TEMP_DIR / f"forensics_{os.getpid()}_{uuid.uuid4().hex}{suffix}"


def _tally_issues(tables):
    """Count the issues attached to ``tables``.

    Returns:
        A ``(total, errors, warnings)`` tuple. Issues whose severity is
        neither ``Severity.ERROR`` nor ``Severity.WARNING`` count only
        towards the total.
    """
    total = error_count = warning_count = 0
    for table in tables:
        for issue in table.issues:
            total += 1
            if issue.severity == Severity.ERROR:
                error_count += 1
            elif issue.severity == Severity.WARNING:
                warning_count += 1
    return total, error_count, warning_count


@router.post("/analyze_docx")
async def analyze_docx(
    file: UploadFile = File(...),
    check_level: str = "EXTRACT_ONLY",
    tolerance_percent: float = 0.1,
    max_table_rows: int = 500
):
    """
    Analyze the table data of an uploaded Word document.

    Args:
        file: The uploaded .docx file. Its extension and size are checked
            with validate_file_extension / validate_file_size.
        check_level: Validation level (EXTRACT_ONLY / L1 / L1_L2).
            "EXTRACT_ONLY" only extracts tables; any other value runs the
            arithmetic validator, and "L1_L2" additionally runs the
            statistical validator.
        tolerance_percent: Percentage tolerance, forwarded to ForensicsConfig.
        max_table_rows: Maximum rows per table, forwarded to ForensicsConfig.

    Returns:
        A JSONResponse carrying the serialized ForensicsResult. When an
        unexpected error occurs, the response has status 500 and a result
        with success=False and the error message.

    Raises:
        HTTPException: status 400 when the extension or size check fails.
    """
    temp_path = None
    # The upload's filename is optional; normalise a missing one to "" so the
    # extension check sees a string instead of None.
    filename = file.filename or ""
    start_time = time.perf_counter()
    try:
        # 1. Validate the file extension.
        is_valid, error_msg = validate_file_extension(filename)
        if not is_valid:
            logger.warning(f"文件格式校验失败: {filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)

        # 2. Read the upload into memory.
        content = await file.read()
        file_size = len(content)

        # 3. Validate the file size.
        is_valid, error_msg = validate_file_size(file_size)
        if not is_valid:
            logger.warning(f"文件大小校验失败: {filename} - {error_msg}")
            raise HTTPException(status_code=400, detail=error_msg)
        logger.info(f"开始分析 Word 文档: {filename}, 大小: {file_size/1024:.1f}KB")

        # 4. Persist the upload to a server-chosen temp path (the client's
        #    file name is deliberately not part of the path).
        temp_path = _make_temp_path(filename)
        with open(temp_path, "wb") as f:
            f.write(content)

        # 5. Build the configuration.
        config = ForensicsConfig(
            check_level=check_level,
            tolerance_percent=tolerance_percent,
            max_table_rows=max_table_rows
        )

        # 6. Extract tables and the document text.
        extractor = DocxTableExtractor(config)
        tables, full_text = extractor.extract(str(temp_path))

        # 7. Detect the statistical methods mentioned in the text.
        methods_found = detect_methods(full_text)
        logger.info(f"检测到统计方法: {methods_found}")

        # 8. Rule validation (optional): EXTRACT_ONLY extracts tables only
        #    and runs no validator.
        if check_level != "EXTRACT_ONLY":
            # L1: arithmetic validation.
            arithmetic_validator = ArithmeticValidator(config)
            for table in tables:
                if not table.skipped:
                    arithmetic_validator.validate(table)
            # L2: statistical validation, only when requested.
            if check_level == "L1_L2":
                stat_validator = StatValidator(config)
                for table in tables:
                    if not table.skipped:
                        stat_validator.validate(table, full_text)

        # 9. Count the issues found.
        total_issues, error_count, warning_count = _tally_issues(tables)
        execution_time_ms = int((time.perf_counter() - start_time) * 1000)

        # 10. Build the result.
        result = ForensicsResult(
            success=True,
            methods_found=methods_found,
            tables=tables,
            total_issues=total_issues,
            error_count=error_count,
            warning_count=warning_count,
            execution_time_ms=execution_time_ms,
            error=None,
            fallback_available=True
        )
        logger.info(
            f"分析完成: {filename}, "
            f"表格: {len(tables)}, "
            f"问题: {total_issues} (ERROR: {error_count}, WARNING: {warning_count}), "
            f"耗时: {execution_time_ms}ms"
        )
        return JSONResponse(content=result.model_dump(by_alias=True))
    except HTTPException:
        # Validation failures keep their own status code and detail.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"分析失败: {filename} - {str(e)}")
        execution_time_ms = int((time.perf_counter() - start_time) * 1000)
        # Return a failure result so the caller can fall back.
        result = ForensicsResult(
            success=False,
            methods_found=[],
            tables=[],
            total_issues=0,
            error_count=0,
            warning_count=0,
            execution_time_ms=execution_time_ms,
            error=str(e),
            fallback_available=True
        )
        return JSONResponse(
            status_code=500,
            # Serialize with by_alias=True, exactly like the success path, so
            # both responses expose the same key names.
            content=result.model_dump(by_alias=True)
        )
    finally:
        # Best-effort removal of the temp file; a failure is only logged.
        if temp_path is not None:
            try:
                if temp_path.exists():
                    os.remove(temp_path)
            except OSError as cleanup_error:
                logger.warning(f"清理临时文件失败: {cleanup_error}")
@router.get("/supported_formats")
async def supported_formats():
    """
    List the supported file formats.

    Returns:
        A dict with the allowed extensions, the maximum file size in MB
        (derived from MAX_FILE_SIZE_BYTES) and a note for users.
    """
    extensions = [*ALLOWED_EXTENSIONS]
    size_limit_mb = MAX_FILE_SIZE_BYTES / 1024 / 1024
    payload = {
        "formats": extensions,
        "max_file_size_mb": size_limit_mb,
        "note": "MVP 阶段仅支持 .docx 格式,.doc 文件请先用 Word 另存为 .docx",
    }
    return payload