Files
AIclinicalresearch/extraction_service/main.py
HaHafeng e785969e54 feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00

2306 lines
71 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
文档提取微服务 - 主入口
功能:
- PDF文本提取PyMuPDF
- Docx文本提取Mammoth
- Txt文本提取直接读取
- 语言检测
- 健康检查
"""
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

from dotenv import load_dotenv
from fastapi import FastAPI, File, UploadFile, HTTPException, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger
from pydantic import BaseModel
# 加载环境变量
load_dotenv()
# 配置日志
logger.remove()
logger.add(
sys.stdout,
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
level=os.getenv("LOG_LEVEL", "INFO")
)
# Create the FastAPI application
app = FastAPI(
    title="文档提取微服务",
    description="提供PDF、Docx、Txt文档的文本提取服务",
    version="1.0.0",
)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # production deployments should whitelist specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Temporary-file directory (created eagerly so request handlers can rely on it)
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)

# Service-module imports
from services.pdf_extractor import extract_pdf_pymupdf
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
from services.language_detector import detect_language, detect_language_detailed
from services.file_utils import detect_file_type, cleanup_temp_file
from services.docx_extractor import extract_docx_mammoth, validate_docx_file
from services.txt_extractor import extract_txt, validate_txt_file
from services.dc_executor import validate_code, execute_pandas_code
# Unified document processor (used by the RAG engine)
from services.document_processor import DocumentProcessor, convert_to_markdown
from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to_markdown
# Document export service (Markdown -> Word)
from services.doc_export_service import check_pandoc_available, convert_markdown_to_docx, create_protocol_docx
# RVW V2.0 data-forensics module
from forensics.api import router as forensics_router

# Register the RVW V2.0 forensics routes.
# BUG FIX: this call previously appeared BEFORE `forensics_router` was
# imported, which raised NameError the moment the module was loaded.
app.include_router(forensics_router)

# Nougat compatibility stubs (feature removed; kept so old callers don't break)
def check_nougat_available(): return False
def get_nougat_info(): return {"available": False, "reason": "已废弃,使用 pymupdf4llm 替代"}
# ✨ 导入预写的数据操作函数
from operations.filter import apply_filter
from operations.recode import apply_recode
from operations.binning import apply_binning
from operations.conditional import apply_conditional_column, apply_simple_binning
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
from operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表
from operations.metric_time_transform import (
apply_metric_time_transform,
detect_common_pattern,
preview_metric_time_transform,
detect_metric_groups, # ✨ 多指标自动分组
apply_multi_metric_to_long, # ✨ 多指标转长表方向1
preview_multi_metric_to_long, # ✨ 多指标转换预览方向1
apply_multi_metric_to_matrix, # ✨ 多指标转矩阵方向2
preview_multi_metric_to_matrix # ✨ 多指标转换预览方向2
)
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
# ==================== Pydantic Models ====================
class ValidateCodeRequest(BaseModel):
    """Request model for pandas-code safety validation."""
    code: str  # the pandas code to validate
class ExecuteCodeRequest(BaseModel):
    """Request model for sandboxed pandas-code execution."""
    data: List[Dict[str, Any]]  # tabular data as a list of row dicts
    code: str  # pandas code that operates on the `df` variable
# Request models for the pre-written operation functions
class FilterRequest(BaseModel):
    """Request model for the advanced row-filter operation."""
    data: List[Dict[str, Any]]
    conditions: List[Dict[str, Any]]  # filter conditions
    logic: str = 'and'  # how conditions combine: 'and' / 'or'
class RecodeRequest(BaseModel):
    """Request model for value recoding (mapping old values to new ones)."""
    data: List[Dict[str, Any]]
    column: str  # source column to recode
    mapping: Dict[Any, Any]  # old value -> new value
    create_new_column: bool = True  # write into a new column instead of in place
    new_column_name: Optional[str] = None  # annotation fixed: default is None
    na_handling: str = 'keep'  # NA handling mode: keep/map/drop
    na_value: Any = None  # NA replacement value
class BinningRequest(BaseModel):
    """Request model for the binning (categorize) operation."""
    data: List[Dict[str, Any]]
    column: str  # numeric column to bin
    method: str  # binning method
    new_column_name: str  # name of the generated column
    bins: Optional[List[Any]] = None  # explicit bin edges (annotation fixed)
    labels: Optional[List[Any]] = None  # bin labels (annotation fixed)
    num_bins: int = 3  # bin count for automatic methods
    na_handling: str = 'keep'  # NA handling mode: keep/label/assign
    na_label: Optional[str] = None  # label used when na_handling == 'label'
    na_assign_to: Optional[int] = None  # bin index NA values are assigned to
class ConditionalRequest(BaseModel):
    """Request model for conditional column generation (IF-THEN-ELSE rules)."""
    data: List[Dict[str, Any]]
    new_column_name: str  # name of the generated column
    rules: List[Dict[str, Any]]  # ordered rules: conditions, logic, result
    else_value: Any = None  # value when no rule matches
class DropnaRequest(BaseModel):
    """Request model for dropping missing values."""
    data: List[Dict[str, Any]]
    method: str  # 'row', 'column', 'both'
    threshold: float = 0.5  # missing-ratio threshold (0-1)
    subset: Optional[List[str]] = None  # restrict the check to these columns (annotation fixed)
class ComputeRequest(BaseModel):
    """Request model for formula-based computed columns."""
    data: List[Dict[str, Any]]
    new_column_name: str  # name of the generated column
    formula: str  # formula to evaluate
    column_mapping: List[Dict[str, str]] = []  # column-name mapping (pydantic copies defaults per instance)
class PivotRequest(BaseModel):
    """Request model for pivot (long table -> wide table)."""
    data: List[Dict[str, Any]]
    index_column: str  # column used as the row index
    pivot_column: str  # column whose values become new columns
    value_columns: List[str]  # value columns to spread
    aggfunc: str = 'first'  # aggregation for duplicate cells
    column_mapping: List[Dict[str, str]] = []  # column-name mapping
    keep_unused_columns: bool = False  # keep columns not selected above
    unused_agg_method: str = 'first'  # aggregation for unused columns: first/mode/mean
    original_column_order: List[str] = []  # original column order
    pivot_value_order: List[str] = []  # original order of the pivot-column values
class UnpivotRequest(BaseModel):
    """Request model for unpivot (wide table -> long table)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns kept unchanged
    value_vars: List[str]  # columns to melt into rows
    var_name: str = '变量'  # name of the variable column
    value_name: str = ''  # name of the value column
    parse_column_names: bool = False  # whether to split the source column names
    separator: str = '_'  # separator used when parsing column names
    metric_name: Optional[str] = None  # metric column name (when parsing)
    time_name: Optional[str] = None  # time column name (when parsing)
    dropna: bool = False  # drop rows whose value is missing
class MetricTimeTransformRequest(BaseModel):
    """Request model for the metric-by-time table transformation."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns kept unchanged
    value_vars: List[str]  # value columns (multiple timepoints of one metric)
    metric_name: Optional[str] = None  # metric name; auto-detected when None
    separator: Optional[str] = None  # separator; auto-detected when None
    timepoint_col_name: str = '时间点'  # name of the timepoint column
class MetricTimeDetectRequest(BaseModel):
    """Request model for detecting a metric-by-time column pattern."""
    value_vars: List[str]  # value columns used for pattern detection
class MultiMetricDetectRequest(BaseModel):
    """Request model for detecting multi-metric column groups."""
    value_vars: List[str]  # value columns used for group detection
    separators: Optional[List[str]] = None  # optional candidate separators
class MultiMetricToLongRequest(BaseModel):
    """Request model for multi-metric -> long table (direction 1)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns
    value_vars: List[str]  # value columns (several metrics x timepoints)
    separators: Optional[List[str]] = None  # optional candidate separators
    event_col_name: str = 'Event_Name'  # name of the timepoint column
class MultiMetricToMatrixRequest(BaseModel):
    """Request model for multi-metric -> matrix (direction 2)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns
    value_vars: List[str]  # value columns (several metrics x timepoints)
    separators: Optional[List[str]] = None  # optional candidate separators
    metric_col_name: str = '指标名'  # name of the metric column
class FillnaStatsRequest(BaseModel):
    """Request model for per-column missing-value statistics."""
    data: List[Dict[str, Any]]
    column: str  # column to inspect
class FillnaSimpleRequest(BaseModel):
    """Request model for simple missing-value imputation."""
    data: List[Dict[str, Any]]
    column: str  # column to fill
    new_column_name: str  # column receiving the filled values
    method: str  # 'mean', 'median', 'mode', 'constant', 'ffill', 'bfill'
    fill_value: Any = None  # constant used when method == 'constant'
class FillnaMiceRequest(BaseModel):
    """Request model for MICE multiple imputation."""
    data: List[Dict[str, Any]]
    columns: List[str]  # columns to impute
    reference_columns: Optional[List[str]] = None  # auxiliary predictor columns
    n_iterations: int = 10  # MICE iteration count
    random_state: int = 42  # RNG seed for reproducibility
class MarkdownToDocxRequest(BaseModel):
    """Request model for Markdown -> Word conversion."""
    content: str  # Markdown content
    use_template: bool = True  # whether to apply the document template
    title: str = "临床研究方案"  # document title
class ProtocolToDocxRequest(BaseModel):
    """Request model for study-protocol -> Word conversion."""
    sections: Dict[str, str]  # section name -> section content
    title: str = "临床研究方案"  # document title
# ==================== API路由 ====================
@app.get("/")
async def root():
    """Root endpoint: report basic service identity and liveness."""
    service_info = {
        "service": "文档提取微服务",
        "version": "1.0.0",
        "status": "running",
    }
    return service_info
@app.get("/api/health")
async def health_check():
    """
    Health-check endpoint.

    Verifies:
    - that PyMuPDF can be imported (and reports its version)
    - Nougat availability (via the deprecated stub)
    - that the temporary directory exists and is writable
    """
    # Pessimistic defaults; overwritten on a successful import
    pymupdf_available = False
    pymupdf_version = "unknown"
    try:
        import fitz  # PyMuPDF
        pymupdf_version = fitz.__version__
        pymupdf_available = True
    except Exception as e:
        logger.warning(f"PyMuPDF不可用: {str(e)}")

    nougat_info = get_nougat_info()
    temp_dir_writable = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)

    overall = "healthy" if (pymupdf_available and temp_dir_writable) else "degraded"
    return {
        "status": overall,
        "checks": {
            "pymupdf": {
                "available": pymupdf_available,
                "version": pymupdf_version,
            },
            "nougat": nougat_info,
            "temp_dir": {
                "path": str(TEMP_DIR),
                "writable": temp_dir_writable,
            },
        },
        "timestamp": datetime.now().isoformat(),
    }
@app.post("/api/extract/pdf")
async def extract_pdf_endpoint(
    file: UploadFile = File(...),
    method: str = "auto"
):
    """
    PDF text-extraction endpoint (method selected automatically unless forced).

    Args:
        file: uploaded PDF file
        method: 'auto' (default) | 'nougat' | 'pymupdf'

    Returns:
        JSON with success flag, method used, reason, extracted text, metadata.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 on extraction failure.
    """
    temp_path = None
    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持PDF文件"
            )
        # BUG FIX: the temp name previously used os.getpid(), which is shared
        # by every concurrent request of one worker, so identical filenames
        # could clobber each other; uuid4 makes the name unique. Path(...).name
        # strips any client-supplied directory parts (path-traversal hardening).
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理PDF文件: {file.filename}, 方法={method}")
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
        # Extract text using the sequential-fallback strategy
        force_method = None if method == "auto" else method
        result = extract_pdf(str(temp_path), force_method=force_method)
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
            )
        # Attach file metadata
        result["metadata"]["file_size"] = file_size
        result["metadata"]["filename"] = file.filename
        logger.info(f"PDF提取成功: {file.filename}, "
                    f"方法={result['method']}, "
                    f"原因={result.get('reason', 'N/A')}")
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"PDF提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        # Remove the temp file regardless of outcome
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/detect-language")
async def detect_language_endpoint(file: UploadFile = File(...)):
    """
    PDF language-detection endpoint.

    Args:
        file: uploaded PDF file

    Returns:
        JSON with language classification, Chinese-character ratio and counts.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 on detection failure.
    """
    temp_path = None
    try:
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")
        # Unique, sanitized temp name (see extract_pdf_endpoint): uuid4 avoids
        # cross-request collisions, Path(...).name blocks path traversal.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        # Run the detailed language detector
        result = detect_language_detailed(str(temp_path))
        result["filename"] = file.filename
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/pdf-strategy")
async def get_strategy_endpoint(file: UploadFile = File(...)):
    """
    Report the recommended PDF-processing strategy without extracting.

    Args:
        file: uploaded PDF file

    Returns:
        JSON with detected language, recommended method, reason, availability.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 on failure.
    """
    temp_path = None
    try:
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")
        # Unique, sanitized temp name: uuid4 avoids cross-request collisions,
        # Path(...).name strips client-supplied directories (path traversal).
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        # Compute the processing strategy only (no extraction)
        result = get_pdf_processing_strategy(str(temp_path))
        result["filename"] = file.filename
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"获取策略失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract/docx")
async def extract_docx_endpoint(file: UploadFile = File(...)):
    """
    Docx text-extraction endpoint (Mammoth).

    Args:
        file: uploaded .docx file

    Returns:
        JSON with success flag, method ("mammoth"), text and metadata
        (char_count, has_tables, file_size, filename).

    Raises:
        HTTPException: 400 for non-docx uploads, 500 on extraction failure.
    """
    temp_path = None
    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.docx'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持Docx文件"
            )
        # Unique, sanitized temp name: uuid4 avoids collisions between
        # concurrent requests; Path(...).name blocks path traversal.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理Docx文件: {file.filename}")
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
        # Extract the text
        result = extract_docx_mammoth(str(temp_path))
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
            )
        # Attach file metadata
        result["method"] = "mammoth"
        result["metadata"]["filename"] = file.filename
        logger.info(f"Docx提取成功: {file.filename}, "
                    f"字符数={result['metadata']['char_count']}")
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract/txt")
async def extract_txt_endpoint(file: UploadFile = File(...)):
    """
    Txt text-extraction endpoint (direct read with encoding detection).

    Args:
        file: uploaded .txt file

    Returns:
        JSON with success flag, method ("direct"), text, detected encoding and
        metadata (char_count, line_count, file_size, filename).

    Raises:
        HTTPException: 400 for non-txt uploads, 500 on extraction failure.
    """
    temp_path = None
    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.txt'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持Txt文件"
            )
        # Unique, sanitized temp name: uuid4 avoids collisions between
        # concurrent requests; Path(...).name blocks path traversal.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理Txt文件: {file.filename}")
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
        # Extract the text
        result = extract_txt(str(temp_path))
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
            )
        # Attach the method identifier and filename
        result["method"] = "direct"
        result["metadata"]["filename"] = file.filename
        logger.info(f"Txt提取成功: {file.filename}, "
                    f"编码={result['encoding']}, "
                    f"字符数={result['metadata']['char_count']}")
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Txt提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    file_type: str = None
):
    """
    Generic extraction endpoint.

    Detects the file type (unless one is supplied) and dispatches to the
    matching format-specific endpoint.

    Args:
        file: uploaded file
        file_type: optional explicit type ('pdf' | 'docx' | 'txt')

    Returns:
        The extraction result of the dispatched endpoint.
    """
    try:
        # Fall back to automatic type detection
        detected = file_type or detect_file_type(file.filename)
        logger.info(f"文件类型: {detected}, 文件名: {file.filename}")
        # Dispatch table instead of an if/elif chain
        handlers = {
            'pdf': extract_pdf_endpoint,
            'docx': extract_docx_endpoint,
            'txt': extract_txt_endpoint,
        }
        handler = handlers.get(detected)
        if handler is None:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件格式: {detected}仅支持PDF、Docx、Txt"
            )
        return await handler(file)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
# ==================== RAG 引擎 - 文档转 Markdown 接口 ====================
@app.post("/api/document/to-markdown")
async def document_to_markdown(
    file: UploadFile = File(...),
    file_type: Optional[str] = None
):
    """
    RAG engine — convert an uploaded document (PDF/Word/TXT/MD) to
    LLM-friendly Markdown. Core document-processing endpoint of the
    knowledge-base engine.

    Args:
        file: uploaded file
        file_type: optional explicit type ('pdf' | 'docx' | 'txt' | 'md')

    Returns:
        JSON with success flag, Markdown text, format and metadata.

    Raises:
        HTTPException: 400 for unsupported formats, 500 on processing failure.
    """
    temp_path = None
    try:
        # BUG FIX: the temp path was previously TEMP_DIR / file.filename,
        # which (a) let a crafted filename with "../" escape TEMP_DIR and
        # (b) let concurrent uploads of the same name overwrite each other.
        # Path(...).name keeps only the basename (extension preserved for the
        # downstream processor); uuid4 makes the name unique.
        safe_name = Path(file.filename).name
        temp_path = TEMP_DIR / f"temp_{uuid.uuid4().hex}_{safe_name}"
        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)
        logger.info(f"RAG 文档处理: {file.filename}, 大小: {len(content)} bytes")
        # Delegate to the unified document processor
        result = await convert_to_markdown(str(temp_path), file_type)
        # Attach the original filename to the metadata
        if result.get("metadata"):
            result["metadata"]["filename"] = file.filename
        else:
            result["metadata"] = {"filename": file.filename}
        return JSONResponse(content=result)
    except ValueError as e:
        logger.warning(f"文档格式不支持: {file.filename}, 错误: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"文档转 Markdown 失败: {file.filename}, 错误: {e}")
        raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")
    finally:
        # Clean up the temp file
        if temp_path and temp_path.exists():
            cleanup_temp_file(str(temp_path))
# ==================== DC工具C - 代码执行接口 ====================
@app.post("/api/dc/validate")
async def validate_pandas_code(request: ValidateCodeRequest):
    """
    DC tool C — safety validation of user-supplied pandas code.

    Runs an AST-based security check on the submitted code.

    Args:
        request: ValidateCodeRequest with the code to check.

    Returns:
        JSON: {"valid": bool, "errors": [...], "warnings": [...]}
    """
    try:
        logger.info(f"开始验证Pandas代码长度: {len(request.code)} 字符")
        validation = validate_code(request.code)
        logger.info(
            f"代码验证完成: valid={validation['valid']}, "
            f"errors={len(validation['errors'])}, warnings={len(validation['warnings'])}"
        )
        return JSONResponse(content=validation)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"代码验证失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"验证失败: {str(e)}"
        )
@app.post("/api/dc/execute")
async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
    """
    DC tool C — execute user pandas code against JSON row data.

    Args:
        request: ExecuteCodeRequest with the rows and the code (operates on
            the `df` variable).

    Returns:
        JSON with success flag, result rows, captured output, error (if any),
        execution time and result shape.
    """
    try:
        logger.info(
            f"开始执行Pandas代码: "
            f"数据行数={len(request.data)}, 代码长度={len(request.code)} 字符"
        )
        # Run the sandboxed executor
        exec_result = execute_pandas_code(request.data, request.code)
        if not exec_result["success"]:
            logger.warning(
                f"代码执行失败: {exec_result.get('error', 'Unknown error')}"
            )
        else:
            logger.info(
                f"代码执行成功: "
                f"结果shape={exec_result.get('result_shape')}, "
                f"耗时={exec_result['execution_time']:.3f}"
            )
        return JSONResponse(content=exec_result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"代码执行接口失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
# ==================== ✨ 预写函数API端点 ====================
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
    """
    Advanced row-filter operation (pre-written function).

    Args:
        request: FilterRequest with rows, conditions and 'and'/'or' logic.

    Returns:
        JSON: {"success", "result_data", "output", "execution_time",
        "result_shape"} (error payload keeps status 200 with success=False,
        matching the original contract).
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import sys
    # BUG FIX: start_time (and the time module) were previously bound inside
    # the try block, so the error path's `time.time() - start_time` could hit
    # an unbound name; bind them before any fallible work, as the sibling
    # operation endpoints effectively do with their locals() guard.
    start_time = time.time()
    # Capture print output produced inside the operation
    captured_output = io.StringIO()
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        # Call the pre-written function
        result_df = apply_filter(df, request.conditions, request.logic)
        # JSON-safe conversion: inf/-inf and NaN become None
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')
        # Restore stdout before logging/returning
        sys.stdout = sys.__stdout__
        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"筛选成功: {len(request.data)}{len(result_data)}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        # Always restore stdout, then report the failure as a JSON payload
        sys.stdout = sys.__stdout__
        logger.error(f"筛选操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
    """
    Value recoding operation (pre-written function).

    Maps old column values to new ones, optionally into a new column, with
    configurable NA handling.

    Args:
        request: RecodeRequest

    Returns:
        JSON: {"success": bool, "result_data": List[Dict], "output": str,
        "execution_time": float, "result_shape": [rows, cols]}
    """
    try:
        import pandas as pd
        import time
        import io
        import sys
        start_time = time.time()
        # Capture print output produced inside the operation
        captured_output = io.StringIO()
        sys.stdout = captured_output
        try:
            # Convert the JSON rows to a DataFrame
            df = pd.DataFrame(request.data)
            # Call the pre-written function, forwarding the NA-handling options
            result_df = apply_recode(
                df,
                request.column,
                request.mapping,
                request.create_new_column,
                request.new_column_name,
                request.na_handling,  # NA handling mode
                request.na_value  # NA mapping value
            )
            # JSON-safe conversion: neutralize NaN and +/-inf values
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')
            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            execution_time = time.time() - start_time
            logger.info(f"重编码成功: {request.column}")
            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })
        except Exception as e:
            # Restore stdout, then let the outer handler format the error
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e
    except Exception as e:
        logger.error(f"重编码操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be unbound if the imports above failed
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        })
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
    """
    Binning (categorical-variable generation) operation (pre-written function).

    Args:
        request: BinningRequest

    Returns:
        JSON: {"success", "result_data", "output", "execution_time",
        "result_shape"} (error payload keeps status 200 with success=False,
        matching the original contract).
    """
    import pandas as pd
    import numpy as np
    import json
    import time
    import io
    import sys
    start_time = time.time()
    # Capture print output produced inside the operation
    captured_output = io.StringIO()
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        # Call the pre-written function, forwarding the NA-handling options
        result_df = apply_binning(
            df,
            request.column,
            request.method,
            request.new_column_name,
            request.bins,
            request.labels,
            request.num_bins,
            request.na_handling,  # NA handling mode
            request.na_label,  # NA label
            request.na_assign_to  # bin index NA values are assigned to
        )
        # 1. Categorical columns -> strings (JSON cannot carry Categorical).
        #    isinstance(dtype, pd.CategoricalDtype) replaces the deprecated
        #    pd.api.types.is_categorical_dtype().
        for col in result_df.columns:
            if isinstance(result_df[col].dtype, pd.CategoricalDtype):
                result_df[col] = result_df[col].astype(str)
        # 2. Replace inf/-inf with None; 3. replace NaN with None
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        # 4. Convert to row dicts
        result_data = result_df.to_dict('records')
        # Restore stdout before logging/returning
        sys.stdout = sys.__stdout__
        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"分箱成功: {request.column}{request.new_column_name}")
        response_content = {
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        }
        # Serialize allowing NaN/Infinity tokens, then map exactly those
        # tokens to null via parse_constant on the way back.
        # BUG FIX: the previous json_str.replace('NaN', 'null') also mangled
        # legitimate string values containing "NaN"/"Infinity";
        # parse_constant fires only on real JSON constant tokens.
        json_str = json.dumps(response_content, allow_nan=True)
        return JSONResponse(content=json.loads(json_str, parse_constant=lambda _: None))
    except Exception as e:
        # Always restore stdout, then report the failure as a JSON payload
        sys.stdout = sys.__stdout__
        logger.error(f"分箱操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
    """
    Conditional column generation (pre-written function).

    Generates a new column from multi-condition IF-THEN-ELSE rules.

    Args:
        request: ConditionalRequest
            - data: rows
            - new_column_name: name of the generated column
            - rules: rule list; each rule holds conditions, logic, result
            - else_value: default when no rule matches

    Returns:
        JSON: {"success": bool, "result_data": List[Dict], "output": str,
        "execution_time": float, "result_shape": [rows, cols]}; errors are
        returned with HTTP 400.
    """
    try:
        import pandas as pd
        import time
        import io
        import sys
        start_time = time.time()
        # Capture print output produced inside the operation
        captured_output = io.StringIO()
        sys.stdout = captured_output
        try:
            # Convert the JSON rows to a DataFrame
            df = pd.DataFrame(request.data)
            # Call the pre-written function
            result_df = apply_conditional_column(
                df,
                request.new_column_name,
                request.rules,
                request.else_value
            )
            # JSON-safe conversion: neutralize NaN and +/-inf values
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')
            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            execution_time = time.time() - start_time
            logger.info(f"条件生成列成功: {request.new_column_name}")
            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })
        except Exception as e:
            # Restore stdout, then let the outer handler format the error
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e
    except Exception as e:
        logger.error(f"条件生成列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be unbound if the imports above failed
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
    """
    Missing-value removal operation (pre-written function).

    Args:
        request: DropnaRequest
            - data: rows
            - method: removal mode ('row', 'column', 'both')
            - threshold: missing-ratio threshold (0-1)
            - subset: optional columns to restrict the check to

    Returns:
        JSON: {"success": bool, "result_data": List[Dict], "output": str,
        "execution_time": float, "result_shape": [rows, cols]}; errors are
        returned with HTTP 400.
    """
    try:
        import pandas as pd
        import time
        import io
        import sys
        start_time = time.time()
        # Capture print output produced inside the operation
        captured_output = io.StringIO()
        sys.stdout = captured_output
        try:
            # Convert the JSON rows to a DataFrame
            df = pd.DataFrame(request.data)
            # Call the pre-written function
            result_df = drop_missing_values(
                df,
                method=request.method,
                threshold=request.threshold,
                subset=request.subset
            )
            # JSON-safe conversion: neutralize NaN and +/-inf values
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')
            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            execution_time = time.time() - start_time
            logger.info(f"删除缺失值成功: {request.method}")
            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })
        except Exception as e:
            # Restore stdout, then let the outer handler format the error
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e
    except Exception as e:
        logger.error(f"删除缺失值操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be unbound if the imports above failed
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
    """
    Computed-column operation (pre-written function): evaluate a formula and
    store the result in a new column.

    Args:
        request: ComputeRequest (data, new_column_name, formula,
            optional column_mapping)

    Returns:
        JSON: {"success", "result_data", "output", "execution_time",
        "result_shape"}; errors are returned with HTTP 400.
    """
    import pandas as pd
    import numpy as np
    import json
    import time
    import io
    import sys
    start_time = time.time()
    # Capture print output produced inside the operation
    captured_output = io.StringIO()
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        # Call the pre-written function, forwarding the column-name mapping
        result_df = compute_column(
            df,
            request.new_column_name,
            request.formula,
            request.column_mapping  # column-name mapping
        )
        # JSON-safe conversion: 1. inf/-inf -> None, 2. NaN -> None
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        # 3. Convert to row dicts
        result_data = result_df.to_dict('records')
        # Restore stdout before logging/returning
        sys.stdout = sys.__stdout__
        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"计算列成功: {request.new_column_name}")
        response_content = {
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        }
        # Serialize allowing NaN/Infinity tokens, then map exactly those
        # tokens to null via parse_constant on the way back.
        # BUG FIX: the previous json_str.replace('NaN', 'null') also mangled
        # legitimate string values containing "NaN"/"Infinity";
        # parse_constant fires only on real JSON constant tokens.
        json_str = json.dumps(response_content, allow_nan=True)
        return JSONResponse(content=json.loads(json_str, parse_constant=lambda _: None))
    except Exception as e:
        # Always restore stdout, then report the failure with HTTP 400
        sys.stdout = sys.__stdout__
        logger.error(f"计算列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
    """
    Pivot operation, long table -> wide table (pre-written function).

    Spreads vertically repeated records into horizontal columns.

    Args:
        request: PivotRequest
            - data: rows
            - index_column: row-index column
            - pivot_column: column whose values become new columns
            - value_columns: value columns to spread
            - aggfunc: aggregation for duplicate cells

    Returns:
        JSON: {"success": bool, "result_data": List[Dict], "output": str,
        "execution_time": float, "result_shape": [rows, cols]}; errors are
        returned with HTTP 400.
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys
        start_time = time.time()
        # Capture print output produced inside the operation
        captured_output = io.StringIO()
        sys.stdout = captured_output
        try:
            # Convert the JSON rows to a DataFrame
            df = pd.DataFrame(request.data)
            # Call the pre-written function, forwarding the mapping and
            # unused-column handling options
            result_df = pivot_long_to_wide(
                df,
                request.index_column,
                request.pivot_column,
                request.value_columns,
                request.aggfunc,
                request.column_mapping,  # column-name mapping
                request.keep_unused_columns,  # keep columns not selected
                request.unused_agg_method,  # aggregation for unused columns
                request.original_column_order,  # original column order
                request.pivot_value_order  # original order of pivot values
            )
            # JSON-safe conversion: neutralize NaN and +/-inf values
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')
            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            execution_time = time.time() - start_time
            logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")
            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })
        except Exception as e:
            # Restore stdout, then let the outer handler format the error
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e
    except Exception as e:
        logger.error(f"Pivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be unbound if the imports above failed
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
@app.post("/api/operations/unpivot")
async def operation_unpivot(request: UnpivotRequest):
    """
    Unpivot operation, wide table -> long table (pre-written function).

    Melts horizontal columns into vertically repeated records. Typical
    clinical scenarios:
    - multi-timepoint follow-up data (FMA_baseline, FMA_week2 ->
      timepoint column + FMA value column)
    - merging multiple indicators (systolic/diastolic BP -> indicator
      column + measurement column)

    Args:
        request: UnpivotRequest
            - data: rows
            - id_vars: identifier columns kept unchanged
            - value_vars: columns to melt
            - var_name: variable-column name (default "变量")
            - value_name: value-column name (default "值")
            - parse_column_names: whether to split source column names
            - separator: separator for parsing (default "_")
            - metric_name: metric-column name (optional)
            - time_name: time-column name (optional)
            - dropna: drop rows with missing values (default False)

    Returns:
        JSON: {"success": bool, "result_data": List[Dict], "output": str,
        "execution_time": float, "result_shape": [rows, cols]}; errors are
        returned with HTTP 400.
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys
        start_time = time.time()
        # Capture print output produced inside the operation
        captured_output = io.StringIO()
        sys.stdout = captured_output
        try:
            # Convert the JSON rows to a DataFrame
            df = pd.DataFrame(request.data)
            # Call the pre-written function
            result_df = apply_unpivot(
                df,
                request.id_vars,
                request.value_vars,
                request.var_name,
                request.value_name,
                request.parse_column_names,
                request.separator,
                request.metric_name,
                request.time_name,
                request.dropna
            )
            # JSON-safe conversion: neutralize NaN and +/-inf values
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')
            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            execution_time = time.time() - start_time
            logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)}")
            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })
        except Exception as e:
            # Restore stdout, then let the outer handler format the error
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e
    except Exception as e:
        logger.error(f"Unpivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be unbound if the imports above failed
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
@app.post("/api/operations/metric-time/detect")
async def operation_metric_time_detect(request: MetricTimeDetectRequest):
"""
检测指标-时间表转换模式
自动分析列名,检测:
- 公共前缀(指标名)
- 分隔符
- 时间点列表
- 置信度
Args:
request: MetricTimeDetectRequest
- value_vars: 值列列表
Returns:
{
"success": bool,
"pattern": {
"common_prefix": str,
"separator": str,
"timepoints": List[str],
"confidence": float,
"message": str
}
}
"""
try:
import time
start_time = time.time()
logger.info(f"检测指标-时间表模式: {len(request.value_vars)}")
# 调用检测函数
pattern = detect_common_pattern(request.value_vars)
execution_time = time.time() - start_time
logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}")
return JSONResponse(content={
"success": pattern['success'],
"pattern": pattern,
"execution_time": execution_time
})
except Exception as e:
logger.error(f"模式检测失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/metric-time")
async def operation_metric_time_transform(request: MetricTimeTransformRequest):
"""
指标-时间表转换操作(预写函数)
将多个时间点列转换为"指标行+时间点列"格式
典型场景:
- 制作临床研究Table 1
- 横向对比同一指标的时间变化
Args:
request: MetricTimeTransformRequest
- data: 数据
- id_vars: ID列保持不变
- value_vars: 值列(同一指标的多个时间点)
- metric_name: 指标名称(可选,自动检测)
- separator: 分隔符(可选,自动检测)
- timepoint_col_name: 时间点列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# ✨ 调用预写函数
result_df = apply_metric_time_transform(
df,
request.id_vars,
request.value_vars,
request.metric_name,
request.separator,
request.timepoint_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"指标-时间表转换失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
# ==================== 多指标转换API ====================
@app.post("/api/operations/multi-metric/detect")
async def operation_multi_metric_detect(request: MultiMetricDetectRequest):
"""
多指标自动分组检测
检测多个指标的列并自动分组
Args:
request: MultiMetricDetectRequest
- value_vars: 值列列表
- separators: 可选的分隔符列表
Returns:
{
"success": bool,
"metric_groups": Dict[str, List[str]], # 指标分组
"separator": str, # 检测到的分隔符
"timepoints": List[str], # 时间点列表
"confidence": float, # 置信度
"message": str
}
"""
try:
result = detect_metric_groups(
request.value_vars,
request.separators
)
logger.info(f"多指标分组检测: {len(request.value_vars)} 列 → {len(result.get('metric_groups', {}))} 个指标")
return JSONResponse(content=result)
except Exception as e:
logger.error(f"多指标分组检测失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e)
}, status_code=400)
@app.post("/api/operations/multi-metric/to-long")
async def operation_multi_metric_to_long(request: MultiMetricToLongRequest):
"""
多指标转长表(时间点为行,指标为列)
将多个指标的宽表转换为长表格式,适合统计分析和可视化
典型场景:
- 纵向研究数据分析
- 重复测量数据准备
- 混合效应模型、GEE分析
- 数据可视化ggplot2、seaborn
Args:
request: MultiMetricToLongRequest
- data: 数据
- id_vars: ID列
- value_vars: 值列(多个指标的多个时间点)
- separators: 可选的分隔符列表
- event_col_name: 时间点列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"grouping": {...}, # 分组信息
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 1. 先检测分组
grouping = detect_metric_groups(
request.value_vars,
request.separators
)
if not grouping['success']:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
return JSONResponse(content={
"success": False,
"error": grouping['message'],
"output": output
}, status_code=400)
# 2. 执行转换
result_df = apply_multi_metric_to_long(
df,
request.id_vars,
grouping['metric_groups'],
grouping['separator'],
request.event_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"grouping": grouping,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"多指标转长表失败: {str(e)}")
import traceback
traceback.print_exc()
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/multi-metric/to-matrix")
async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest):
"""
多指标转矩阵(时间点为列,指标为行)
将多个指标的宽表转换为矩阵格式,适合临床报告和数据审查
典型场景:
- 临床研究报告
- 数据审查表
- CRF核对
- 单受试者数据审查
Args:
request: MultiMetricToMatrixRequest
- data: 数据
- id_vars: ID列
- value_vars: 值列(多个指标的多个时间点)
- separators: 可选的分隔符列表
- metric_col_name: 指标列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"grouping": {...}, # 分组信息
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 1. 先检测分组
grouping = detect_metric_groups(
request.value_vars,
request.separators
)
if not grouping['success']:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
return JSONResponse(content={
"success": False,
"error": grouping['message'],
"output": output
}, status_code=400)
# 2. 执行转换
result_df = apply_multi_metric_to_matrix(
df,
request.id_vars,
grouping['metric_groups'],
grouping['separator'],
'Event_Name',
request.metric_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"grouping": grouping,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"多指标转矩阵失败: {str(e)}")
import traceback
traceback.print_exc()
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/fillna-stats")
async def operation_fillna_stats(request: FillnaStatsRequest):
"""
获取列的缺失值统计信息
Args:
request: FillnaStatsRequest
- data: 数据
- column: 列名
Returns:
{
"success": bool,
"stats": Dict (缺失值统计信息),
"execution_time": float
}
"""
try:
import pandas as pd
import time
start_time = time.time()
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用统计函数
stats = get_column_missing_stats(df, request.column)
execution_time = time.time() - start_time
logger.info(f"获取列 '{request.column}' 的缺失值统计成功")
return JSONResponse(content={
"success": True,
"stats": stats,
"execution_time": execution_time
})
except Exception as e:
logger.error(f"获取缺失值统计失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/fillna-simple")
async def operation_fillna_simple(request: FillnaSimpleRequest):
"""
简单填补缺失值(均值、中位数、众数、固定值、前向、后向)
Args:
request: FillnaSimpleRequest
- data: 数据
- column: 原始列名
- new_column_name: 新列名
- method: 填补方法
- fill_value: 固定值method='constant'时使用)
Returns:
{
"success": bool,
"result_data": List[Dict],
"stats": Dict (填补统计信息),
"message": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
start_time = time.time()
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用填补函数
result = fillna_simple(
df,
request.column,
request.new_column_name,
request.method,
request.fill_value
)
execution_time = time.time() - start_time
logger.info(f"简单填补成功: {request.method} on '{request.column}'")
return JSONResponse(content={
"success": result['success'],
"result_data": result['result_data'],
"stats": result['stats'],
"message": result['message'],
"execution_time": execution_time
})
except Exception as e:
logger.error(f"简单填补失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/fillna-mice")
async def operation_fillna_mice(request: FillnaMiceRequest):
"""
MICE多重插补
Args:
request: FillnaMiceRequest
- data: 数据
- columns: 要填补的列名列表
- n_iterations: 迭代次数
- random_state: 随机种子
Returns:
{
"success": bool,
"result_data": List[Dict],
"stats": Dict (各列的填补统计信息),
"message": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
start_time = time.time()
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用MICE填补函数
result = fillna_mice(
df,
request.columns,
request.reference_columns, # ⭐ 新增:传递参考列
request.n_iterations,
request.random_state
)
execution_time = time.time() - start_time
logger.info(f"MICE填补成功: {len(request.columns)}")
return JSONResponse(content={
"success": result['success'],
"result_data": result['result_data'],
"stats": result['stats'],
"message": result['message'],
"execution_time": execution_time
})
except Exception as e:
logger.error(f"MICE填补失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
# ==================== Word 导出 API ====================
@app.get("/api/pandoc/status")
async def pandoc_status():
"""
检查 Pandoc 可用性
Returns:
{
"available": bool,
"version": str,
"message": str
}
"""
try:
result = check_pandoc_available()
logger.info(f"Pandoc 状态检查: {result}")
return JSONResponse(content=result)
except Exception as e:
logger.error(f"Pandoc 状态检查失败: {str(e)}")
return JSONResponse(content={
"available": False,
"version": None,
"message": f"检查失败: {str(e)}"
})
@app.post("/api/convert/docx")
async def convert_to_docx(request: MarkdownToDocxRequest):
"""
Markdown 转 Word 接口
将 Markdown 文本转换为 Word 文档(.docx
Args:
request: MarkdownToDocxRequest
- content: Markdown 内容
- use_template: 是否使用模板(默认 True
- title: 文档标题
Returns:
Word 文档二进制数据application/vnd.openxmlformats-officedocument.wordprocessingml.document
"""
try:
logger.info(f"开始转换 Markdown → Word, 内容长度: {len(request.content)} 字符")
# 执行转换
result = convert_markdown_to_docx(
markdown_text=request.content,
use_template=request.use_template
)
if not result["success"]:
logger.error(f"转换失败: {result.get('error', 'Unknown error')}")
raise HTTPException(
status_code=500,
detail=result.get("error", "转换失败")
)
# 读取生成的文件
output_path = result["output_path"]
with open(output_path, 'rb') as f:
content = f.read()
# 清理临时文件
try:
os.remove(output_path)
except Exception as e:
logger.warning(f"清理临时文件失败: {e}")
logger.info(f"Markdown → Word 转换成功, 文件大小: {len(content)} bytes")
# 返回文件
return Response(
content=content,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={
"Content-Disposition": f'attachment; filename="document.docx"'
}
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Markdown → Word 转换失败: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"转换失败: {str(e)}"
)
@app.post("/api/protocol/export/docx")
async def export_protocol_to_docx(request: ProtocolToDocxRequest):
"""
研究方案导出为 Word 接口
将分章节的研究方案内容导出为格式化的 Word 文档
Args:
request: ProtocolToDocxRequest
- sections: 章节内容字典
- title: 文档标题
Returns:
Word 文档二进制数据
"""
try:
logger.info(f"开始导出研究方案, 章节数: {len(request.sections)}")
# 执行转换
result = create_protocol_docx(
sections=request.sections,
title=request.title
)
if not result["success"]:
logger.error(f"导出失败: {result.get('error', 'Unknown error')}")
raise HTTPException(
status_code=500,
detail=result.get("error", "导出失败")
)
# 读取生成的文件
output_path = result["output_path"]
with open(output_path, 'rb') as f:
content = f.read()
# 清理临时文件
try:
os.remove(output_path)
except Exception as e:
logger.warning(f"清理临时文件失败: {e}")
logger.info(f"研究方案导出成功, 文件大小: {len(content)} bytes")
# 返回文件
return Response(
content=content,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={
"Content-Disposition": f'attachment; filename="research_protocol.docx"'
}
)
except HTTPException:
raise
except Exception as e:
logger.error(f"研究方案导出失败: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"导出失败: {str(e)}"
)
# ==================== 启动配置 ====================
if __name__ == "__main__":
import uvicorn
port = int(os.getenv("SERVICE_PORT", 8000))
host = os.getenv("SERVICE_HOST", "0.0.0.0")
debug = os.getenv("DEBUG", "True").lower() == "true"
logger.info(f"启动文档提取微服务...")
logger.info(f"地址: http://{host}:{port}")
logger.info(f"健康检查: http://{host}:{port}/api/health")
logger.info(f"调试模式: {debug}")
uvicorn.run(
"main:app",
host=host,
port=port,
reload=debug,
log_level="info"
)