- Add one-click research protocol generation with streaming output
- Implement Word document export via Pandoc integration
- Add dynamic dual-panel layout with resizable split pane
- Implement collapsible content for StatePanel stages
- Add conversation history management with title auto-update
- Fix scroll behavior, markdown rendering, and UI layout issues
- Simplify conversation creation logic for reliability

2300 lines · 71 KiB · Python
"""
|
||
文档提取微服务 - 主入口
|
||
|
||
功能:
|
||
- PDF文本提取(PyMuPDF)
|
||
- Docx文本提取(Mammoth)
|
||
- Txt文本提取(直接读取)
|
||
- 语言检测
|
||
- 健康检查
|
||
"""
|
||
|
||
from fastapi import FastAPI, File, UploadFile, HTTPException, Response
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from fastapi.responses import JSONResponse
|
||
from pydantic import BaseModel
|
||
from typing import List, Dict, Any, Optional
|
||
from loguru import logger
|
||
from pathlib import Path
|
||
import os
|
||
import sys
|
||
from datetime import datetime
|
||
from dotenv import load_dotenv
|
||
|
||
# 加载环境变量
|
||
load_dotenv()
|
||
|
||
# 配置日志
|
||
logger.remove()
|
||
logger.add(
|
||
sys.stdout,
|
||
format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
|
||
level=os.getenv("LOG_LEVEL", "INFO")
|
||
)
|
||
|
||
# 创建FastAPI应用
|
||
app = FastAPI(
|
||
title="文档提取微服务",
|
||
description="提供PDF、Docx、Txt文档的文本提取服务",
|
||
version="1.0.0",
|
||
)
|
||
|
||
# CORS配置
|
||
app.add_middleware(
|
||
CORSMiddleware,
|
||
allow_origins=["*"], # 生产环境应该限制具体域名
|
||
allow_credentials=True,
|
||
allow_methods=["*"],
|
||
allow_headers=["*"],
|
||
)
|
||
|
||
# 临时文件目录
|
||
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
|
||
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 导入服务模块
|
||
from services.pdf_extractor import extract_pdf_pymupdf
|
||
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
|
||
from services.language_detector import detect_language, detect_language_detailed
|
||
from services.file_utils import detect_file_type, cleanup_temp_file
|
||
from services.docx_extractor import extract_docx_mammoth, validate_docx_file
|
||
from services.txt_extractor import extract_txt, validate_txt_file
|
||
from services.dc_executor import validate_code, execute_pandas_code
|
||
# 新增:统一文档处理器(RAG 引擎使用)
|
||
from services.document_processor import DocumentProcessor, convert_to_markdown
|
||
from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to_markdown
|
||
# 新增:文档导出服务(Markdown → Word)
|
||
from services.doc_export_service import check_pandoc_available, convert_markdown_to_docx, create_protocol_docx
|
||
|
||
# 兼容:nougat 相关(已废弃,保留空实现避免报错)
|
||
def check_nougat_available():
    """Compatibility stub: the Nougat backend was removed, so it is never available."""
    return False
|
||
def get_nougat_info(): return {"available": False, "reason": "已废弃,使用 pymupdf4llm 替代"}
|
||
|
||
# ✨ 导入预写的数据操作函数
|
||
from operations.filter import apply_filter
|
||
from operations.recode import apply_recode
|
||
from operations.binning import apply_binning
|
||
from operations.conditional import apply_conditional_column, apply_simple_binning
|
||
from operations.dropna import drop_missing_values, get_missing_summary
|
||
from operations.compute import compute_column, get_formula_examples
|
||
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||
from operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表
|
||
from operations.metric_time_transform import (
|
||
apply_metric_time_transform,
|
||
detect_common_pattern,
|
||
preview_metric_time_transform,
|
||
detect_metric_groups, # ✨ 多指标自动分组
|
||
apply_multi_metric_to_long, # ✨ 多指标转长表(方向1)
|
||
preview_multi_metric_to_long, # ✨ 多指标转换预览(方向1)
|
||
apply_multi_metric_to_matrix, # ✨ 多指标转矩阵(方向2)
|
||
preview_multi_metric_to_matrix # ✨ 多指标转换预览(方向2)
|
||
)
|
||
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
|
||
|
||
|
||
# ==================== Pydantic Models ====================
|
||
|
||
class ValidateCodeRequest(BaseModel):
    """Request model for pandas-code safety validation."""
    # Pandas code to be validated (AST safety check).
    code: str


class ExecuteCodeRequest(BaseModel):
    """Request model for pandas-code execution."""
    # Input rows as a list of records (JSON array of objects).
    data: List[Dict[str, Any]]
    # Pandas code that operates on the `df` variable.
    code: str
|
||
|
||
# ✨ 预写函数请求模型
|
||
# Request models for the pre-written operation endpoints.
class FilterRequest(BaseModel):
    """Request model for row filtering."""
    data: List[Dict[str, Any]]
    conditions: List[Dict[str, Any]]
    logic: str = 'and'  # how conditions combine: 'and' / 'or'


class RecodeRequest(BaseModel):
    """Request model for value recoding (re-mapping) of one column."""
    data: List[Dict[str, Any]]
    column: str
    mapping: Dict[Any, Any]
    create_new_column: bool = True
    new_column_name: Optional[str] = None
    na_handling: str = 'keep'  # NA handling mode (keep/map/drop)
    na_value: Any = None  # value NA is mapped to
|
||
|
||
class BinningRequest(BaseModel):
    """Request model for binning a column into categories."""
    data: List[Dict[str, Any]]
    column: str
    method: str
    new_column_name: str
    bins: Optional[List[Any]] = None
    labels: Optional[List[Any]] = None
    num_bins: int = 3
    na_handling: str = 'keep'  # NA handling mode (keep/label/assign)
    na_label: Optional[str] = None  # label applied to NA values
    na_assign_to: Optional[int] = None  # group index NA values are assigned to


class ConditionalRequest(BaseModel):
    """Request model for conditionally generating a new column."""
    data: List[Dict[str, Any]]
    new_column_name: str
    rules: List[Dict[str, Any]]
    else_value: Any = None
|
||
|
||
class DropnaRequest(BaseModel):
    """Request model for dropping missing values."""
    data: List[Dict[str, Any]]
    method: str  # 'row', 'column', 'both'
    threshold: float = 0.5
    subset: Optional[List[str]] = None


class ComputeRequest(BaseModel):
    """Request model for computing a new column from a formula."""
    data: List[Dict[str, Any]]
    new_column_name: str
    formula: str
    column_mapping: List[Dict[str, str]] = []  # column-name mapping
|
||
|
||
class PivotRequest(BaseModel):
    """Request model for pivot (long table to wide table)."""
    data: List[Dict[str, Any]]
    index_column: str
    pivot_column: str
    value_columns: List[str]
    aggfunc: str = 'first'
    column_mapping: List[Dict[str, str]] = []  # column-name mapping
    keep_unused_columns: bool = False  # whether to keep unselected columns
    unused_agg_method: str = 'first'  # aggregation for unselected columns (first/mode/mean)
    original_column_order: List[str] = []  # original column order
    pivot_value_order: List[str] = []  # original order of the pivot column's values


class UnpivotRequest(BaseModel):
    """Request model for unpivot (wide table to long table)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns (kept unchanged)
    value_vars: List[str]  # value columns to be melted
    var_name: str = '变量'  # name of the resulting variable column
    value_name: str = '值'  # name of the resulting value column
    parse_column_names: bool = False  # whether to parse column names
    separator: str = '_'  # separator used when parsing column names
    metric_name: Optional[str] = None  # metric column name
    time_name: Optional[str] = None  # time column name
    dropna: bool = False  # whether to drop rows with missing values
|
||
|
||
|
||
class MetricTimeTransformRequest(BaseModel):
    """Request model for metric-time table transformation."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns (kept unchanged)
    value_vars: List[str]  # value columns (one metric at multiple time points)
    metric_name: Optional[str] = None  # metric name (auto-detected when None)
    separator: Optional[str] = None  # separator (auto-detected when None)
    timepoint_col_name: str = '时间点'  # name of the time-point column


class MetricTimeDetectRequest(BaseModel):
    """Request model for metric-time pattern detection."""
    value_vars: List[str]  # value columns used to detect the pattern


class MultiMetricDetectRequest(BaseModel):
    """Request model for multi-metric group detection."""
    value_vars: List[str]  # value columns used to detect groups
    separators: Optional[List[str]] = None  # optional list of candidate separators
|
||
|
||
|
||
class MultiMetricToLongRequest(BaseModel):
    """Request model for multi-metric to long-table conversion (direction 1)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns
    value_vars: List[str]  # value columns (multiple metrics at multiple time points)
    separators: Optional[List[str]] = None  # optional list of candidate separators
    event_col_name: str = 'Event_Name'  # name of the time-point column


class MultiMetricToMatrixRequest(BaseModel):
    """Request model for multi-metric to matrix conversion (direction 2)."""
    data: List[Dict[str, Any]]
    id_vars: List[str]  # identifier columns
    value_vars: List[str]  # value columns (multiple metrics at multiple time points)
    separators: Optional[List[str]] = None  # optional list of candidate separators
    metric_col_name: str = '指标名'  # name of the metric column
|
||
|
||
|
||
class FillnaStatsRequest(BaseModel):
    """Request model for per-column missing-value statistics."""
    data: List[Dict[str, Any]]
    column: str


class FillnaSimpleRequest(BaseModel):
    """Request model for simple imputation of one column."""
    data: List[Dict[str, Any]]
    column: str
    new_column_name: str
    method: str  # 'mean', 'median', 'mode', 'constant', 'ffill', 'bfill'
    fill_value: Any = None


class FillnaMiceRequest(BaseModel):
    """Request model for MICE multiple imputation."""
    data: List[Dict[str, Any]]
    columns: List[str]
    reference_columns: Optional[List[str]] = None  # reference columns
    n_iterations: int = 10
    random_state: int = 42
|
||
|
||
|
||
class MarkdownToDocxRequest(BaseModel):
    """Request model for Markdown-to-Word conversion."""
    content: str  # Markdown content
    use_template: bool = True  # whether to apply the document template
    title: str = "临床研究方案"  # document title


class ProtocolToDocxRequest(BaseModel):
    """Request model for research-protocol-to-Word conversion."""
    sections: Dict[str, str]  # section name -> section content
    title: str = "临床研究方案"  # document title
|
||
|
||
|
||
# ==================== API路由 ====================
|
||
|
||
@app.get("/")
|
||
async def root():
|
||
"""根路径"""
|
||
return {
|
||
"service": "文档提取微服务",
|
||
"version": "1.0.0",
|
||
"status": "running"
|
||
}
|
||
|
||
|
||
@app.get("/api/health")
|
||
async def health_check():
|
||
"""
|
||
健康检查接口
|
||
|
||
检查项:
|
||
- 服务是否运行
|
||
- PyMuPDF是否可用
|
||
- Nougat是否可用
|
||
- 临时目录是否可写
|
||
"""
|
||
try:
|
||
import fitz # PyMuPDF
|
||
pymupdf_version = fitz.__version__
|
||
pymupdf_available = True
|
||
except Exception as e:
|
||
pymupdf_version = "unknown"
|
||
pymupdf_available = False
|
||
logger.warning(f"PyMuPDF不可用: {str(e)}")
|
||
|
||
# 检查Nougat
|
||
nougat_info = get_nougat_info()
|
||
|
||
# 检查临时目录
|
||
temp_dir_writable = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)
|
||
|
||
return {
|
||
"status": "healthy" if (pymupdf_available and temp_dir_writable) else "degraded",
|
||
"checks": {
|
||
"pymupdf": {
|
||
"available": pymupdf_available,
|
||
"version": pymupdf_version
|
||
},
|
||
"nougat": nougat_info,
|
||
"temp_dir": {
|
||
"path": str(TEMP_DIR),
|
||
"writable": temp_dir_writable
|
||
}
|
||
},
|
||
"timestamp": datetime.now().isoformat()
|
||
}
|
||
|
||
|
||
@app.post("/api/extract/pdf")
|
||
async def extract_pdf_endpoint(
|
||
file: UploadFile = File(...),
|
||
method: str = "auto"
|
||
):
|
||
"""
|
||
PDF文本提取接口(智能选择方法)
|
||
|
||
Args:
|
||
file: 上传的PDF文件
|
||
method: 提取方法 ('auto' | 'nougat' | 'pymupdf')
|
||
- auto: 自动选择(默认)
|
||
- nougat: 强制使用Nougat
|
||
- pymupdf: 强制使用PyMuPDF
|
||
|
||
Returns:
|
||
{
|
||
"success": true,
|
||
"method": "nougat" | "pymupdf",
|
||
"reason": "...",
|
||
"text": "提取的文本内容",
|
||
"metadata": {...}
|
||
}
|
||
"""
|
||
temp_path = None
|
||
|
||
try:
|
||
# 验证文件类型
|
||
if not file.filename.lower().endswith('.pdf'):
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail="文件格式错误,只支持PDF文件"
|
||
)
|
||
|
||
# 保存临时文件
|
||
temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"
|
||
|
||
logger.info(f"开始处理PDF文件: {file.filename}, 方法={method}")
|
||
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
file_size = len(content)
|
||
logger.info(f"文件大小: {file_size / 1024:.2f} KB")
|
||
|
||
# 提取文本(使用顺序降级策略)
|
||
force_method = None if method == "auto" else method
|
||
result = extract_pdf(str(temp_path), force_method=force_method)
|
||
|
||
if not result["success"]:
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
|
||
)
|
||
|
||
# 添加文件元数据
|
||
result["metadata"]["file_size"] = file_size
|
||
result["metadata"]["filename"] = file.filename
|
||
|
||
logger.info(f"PDF提取成功: {file.filename}, "
|
||
f"方法={result['method']}, "
|
||
f"原因={result.get('reason', 'N/A')}")
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"PDF提取失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
finally:
|
||
# 清理临时文件
|
||
if temp_path:
|
||
cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/detect-language")
|
||
async def detect_language_endpoint(file: UploadFile = File(...)):
|
||
"""
|
||
PDF语言检测接口
|
||
|
||
Args:
|
||
file: 上传的PDF文件
|
||
|
||
Returns:
|
||
{
|
||
"language": "chinese" | "english" | "mixed",
|
||
"chinese_ratio": 0.65,
|
||
"chinese_chars": 3500,
|
||
"total_chars": 5000
|
||
}
|
||
"""
|
||
temp_path = None
|
||
|
||
try:
|
||
if not file.filename.lower().endswith('.pdf'):
|
||
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
||
|
||
# 保存临时文件
|
||
temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"
|
||
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
# 检测语言
|
||
result = detect_language_detailed(str(temp_path))
|
||
result["filename"] = file.filename
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"语言检测失败: {str(e)}")
|
||
raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")
|
||
|
||
finally:
|
||
if temp_path:
|
||
cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/pdf-strategy")
|
||
async def get_strategy_endpoint(file: UploadFile = File(...)):
|
||
"""
|
||
获取PDF处理策略(不实际提取)
|
||
|
||
Args:
|
||
file: 上传的PDF文件
|
||
|
||
Returns:
|
||
{
|
||
"detected_language": "chinese" | "english",
|
||
"recommended_method": "nougat" | "pymupdf",
|
||
"reason": "...",
|
||
"nougat_available": true
|
||
}
|
||
"""
|
||
temp_path = None
|
||
|
||
try:
|
||
if not file.filename.lower().endswith('.pdf'):
|
||
raise HTTPException(status_code=400, detail="只支持PDF文件")
|
||
|
||
temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"
|
||
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
# 获取处理策略
|
||
result = get_pdf_processing_strategy(str(temp_path))
|
||
result["filename"] = file.filename
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"获取策略失败: {str(e)}")
|
||
raise HTTPException(status_code=500, detail=f"失败: {str(e)}")
|
||
|
||
finally:
|
||
if temp_path:
|
||
cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract/docx")
|
||
async def extract_docx_endpoint(file: UploadFile = File(...)):
|
||
"""
|
||
Docx文档提取接口
|
||
|
||
Args:
|
||
file: 上传的Docx文件
|
||
|
||
Returns:
|
||
{
|
||
"success": true,
|
||
"method": "mammoth",
|
||
"text": "提取的文本内容",
|
||
"metadata": {
|
||
"char_count": 字符数,
|
||
"has_tables": 是否包含表格,
|
||
"file_size": 文件大小
|
||
}
|
||
}
|
||
"""
|
||
temp_path = None
|
||
|
||
try:
|
||
# 验证文件类型
|
||
if not file.filename.lower().endswith('.docx'):
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail="文件格式错误,只支持Docx文件"
|
||
)
|
||
|
||
# 保存临时文件
|
||
temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"
|
||
|
||
logger.info(f"开始处理Docx文件: {file.filename}")
|
||
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
file_size = len(content)
|
||
logger.info(f"文件大小: {file_size / 1024:.2f} KB")
|
||
|
||
# 提取文本
|
||
result = extract_docx_mammoth(str(temp_path))
|
||
|
||
if not result["success"]:
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
|
||
)
|
||
|
||
# 添加文件元数据
|
||
result["method"] = "mammoth"
|
||
result["metadata"]["filename"] = file.filename
|
||
|
||
logger.info(f"Docx提取成功: {file.filename}, "
|
||
f"字符数={result['metadata']['char_count']}")
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"Docx提取失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
finally:
|
||
if temp_path:
|
||
cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract/txt")
|
||
async def extract_txt_endpoint(file: UploadFile = File(...)):
|
||
"""
|
||
Txt文本文件提取接口
|
||
|
||
Args:
|
||
file: 上传的Txt文件
|
||
|
||
Returns:
|
||
{
|
||
"success": true,
|
||
"method": "direct",
|
||
"text": "文本内容",
|
||
"encoding": "utf-8",
|
||
"metadata": {
|
||
"char_count": 字符数,
|
||
"line_count": 行数,
|
||
"file_size": 文件大小
|
||
}
|
||
}
|
||
"""
|
||
temp_path = None
|
||
|
||
try:
|
||
# 验证文件类型
|
||
if not file.filename.lower().endswith('.txt'):
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail="文件格式错误,只支持Txt文件"
|
||
)
|
||
|
||
# 保存临时文件
|
||
temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"
|
||
|
||
logger.info(f"开始处理Txt文件: {file.filename}")
|
||
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
file_size = len(content)
|
||
logger.info(f"文件大小: {file_size / 1024:.2f} KB")
|
||
|
||
# 提取文本
|
||
result = extract_txt(str(temp_path))
|
||
|
||
if not result["success"]:
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
|
||
)
|
||
|
||
# 添加方法标识和文件名
|
||
result["method"] = "direct"
|
||
result["metadata"]["filename"] = file.filename
|
||
|
||
logger.info(f"Txt提取成功: {file.filename}, "
|
||
f"编码={result['encoding']}, "
|
||
f"字符数={result['metadata']['char_count']}")
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"Txt提取失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
finally:
|
||
if temp_path:
|
||
cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract")
|
||
async def extract_document(
|
||
file: UploadFile = File(...),
|
||
file_type: str = None
|
||
):
|
||
"""
|
||
通用文档提取接口
|
||
|
||
自动检测文件类型并调用相应的提取方法
|
||
|
||
Args:
|
||
file: 上传的文件
|
||
file_type: 可选,指定文件类型 ('pdf' | 'docx' | 'txt')
|
||
|
||
Returns:
|
||
提取结果
|
||
"""
|
||
try:
|
||
# 自动检测文件类型
|
||
if not file_type:
|
||
file_type = detect_file_type(file.filename)
|
||
|
||
logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")
|
||
|
||
# 根据类型调用不同的处理函数
|
||
if file_type == 'pdf':
|
||
return await extract_pdf_endpoint(file)
|
||
elif file_type == 'docx':
|
||
return await extract_docx_endpoint(file)
|
||
elif file_type == 'txt':
|
||
return await extract_txt_endpoint(file)
|
||
else:
|
||
raise HTTPException(
|
||
status_code=400,
|
||
detail=f"不支持的文件格式: {file_type},仅支持PDF、Docx、Txt"
|
||
)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"文档提取失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
|
||
# ==================== RAG 引擎 - 文档转 Markdown 接口 ====================
|
||
|
||
@app.post("/api/document/to-markdown")
|
||
async def document_to_markdown(
|
||
file: UploadFile = File(...),
|
||
file_type: Optional[str] = None
|
||
):
|
||
"""
|
||
RAG 引擎 - 文档转 Markdown 接口
|
||
|
||
将各种格式的文档(PDF、Word、TXT 等)转换为 LLM 友好的 Markdown 格式。
|
||
这是知识库引擎的核心文档处理接口。
|
||
|
||
Args:
|
||
file: 上传的文件
|
||
file_type: 可选,指定文件类型 ('pdf' | 'docx' | 'txt' | 'md')
|
||
|
||
Returns:
|
||
{
|
||
"success": true,
|
||
"text": "# 文档标题\\n\\n文档内容...",
|
||
"format": "markdown",
|
||
"metadata": {
|
||
"original_file_type": "pdf",
|
||
"char_count": 12345,
|
||
"filename": "example.pdf"
|
||
}
|
||
}
|
||
|
||
Raises:
|
||
400: 不支持的文件格式
|
||
500: 处理失败
|
||
"""
|
||
temp_path = None
|
||
try:
|
||
# 保存上传的文件到临时目录
|
||
temp_path = TEMP_DIR / file.filename
|
||
with open(temp_path, "wb") as f:
|
||
content = await file.read()
|
||
f.write(content)
|
||
|
||
logger.info(f"RAG 文档处理: {file.filename}, 大小: {len(content)} bytes")
|
||
|
||
# 调用统一文档处理器
|
||
result = await convert_to_markdown(str(temp_path), file_type)
|
||
|
||
# 补充文件名到 metadata
|
||
if result.get("metadata"):
|
||
result["metadata"]["filename"] = file.filename
|
||
else:
|
||
result["metadata"] = {"filename": file.filename}
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except ValueError as e:
|
||
logger.warning(f"文档格式不支持: {file.filename}, 错误: {e}")
|
||
raise HTTPException(status_code=400, detail=str(e))
|
||
except Exception as e:
|
||
logger.error(f"文档转 Markdown 失败: {file.filename}, 错误: {e}")
|
||
raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")
|
||
finally:
|
||
# 清理临时文件
|
||
if temp_path and temp_path.exists():
|
||
cleanup_temp_file(str(temp_path))
|
||
|
||
|
||
# ==================== DC工具C - 代码执行接口 ====================
|
||
|
||
@app.post("/api/dc/validate")
|
||
async def validate_pandas_code(request: ValidateCodeRequest):
|
||
"""
|
||
DC工具C - Pandas代码安全验证接口
|
||
|
||
Args:
|
||
request: ValidateCodeRequest
|
||
- code: str # 待验证的Pandas代码
|
||
|
||
Returns:
|
||
{
|
||
"valid": bool,
|
||
"errors": List[str],
|
||
"warnings": List[str]
|
||
}
|
||
"""
|
||
try:
|
||
logger.info(f"开始验证Pandas代码,长度: {len(request.code)} 字符")
|
||
|
||
# 执行AST安全检查
|
||
result = validate_code(request.code)
|
||
|
||
logger.info(
|
||
f"代码验证完成: valid={result['valid']}, "
|
||
f"errors={len(result['errors'])}, warnings={len(result['warnings'])}"
|
||
)
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"代码验证失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"验证失败: {str(e)}"
|
||
)
|
||
|
||
|
||
@app.post("/api/dc/execute")
|
||
async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
|
||
"""
|
||
DC工具C - Pandas代码执行接口
|
||
|
||
Args:
|
||
request: ExecuteCodeRequest
|
||
- data: List[Dict] # JSON格式的数据(数组对象)
|
||
- code: str # Pandas代码(操作df变量)
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict], # 执行后的数据
|
||
"output": str, # 打印输出
|
||
"error": str, # 错误信息(如果失败)
|
||
"execution_time": float, # 执行时间(秒)
|
||
"result_shape": [rows, cols] # 结果形状
|
||
}
|
||
"""
|
||
try:
|
||
logger.info(
|
||
f"开始执行Pandas代码: "
|
||
f"数据行数={len(request.data)}, 代码长度={len(request.code)} 字符"
|
||
)
|
||
|
||
# 执行代码
|
||
result = execute_pandas_code(request.data, request.code)
|
||
|
||
if result["success"]:
|
||
logger.info(
|
||
f"代码执行成功: "
|
||
f"结果shape={result.get('result_shape')}, "
|
||
f"耗时={result['execution_time']:.3f}秒"
|
||
)
|
||
else:
|
||
logger.warning(
|
||
f"代码执行失败: {result.get('error', 'Unknown error')}"
|
||
)
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"代码执行接口失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"处理失败: {str(e)}"
|
||
)
|
||
|
||
|
||
# ==================== ✨ 预写函数API端点 ====================
|
||
|
||
@app.post("/api/operations/filter")
|
||
async def operation_filter(request: FilterRequest):
|
||
"""
|
||
高级筛选操作(预写函数)
|
||
|
||
Args:
|
||
request: FilterRequest
|
||
- data: List[Dict] # 输入数据
|
||
- conditions: List[Dict] # 筛选条件
|
||
- logic: str # 'and' 或 'or'
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float,
|
||
"result_shape": [rows, cols]
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用预写函数
|
||
result_df = apply_filter(df, request.conditions, request.logic)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
import numpy as np
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"筛选成功: {len(request.data)} → {len(result_data)} 行")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"筛选操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time
|
||
})
|
||
|
||
|
||
@app.post("/api/operations/recode")
|
||
async def operation_recode(request: RecodeRequest):
|
||
"""
|
||
数值映射(重编码)操作(预写函数)
|
||
|
||
Args:
|
||
request: RecodeRequest
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用预写函数(传递NA处理参数)
|
||
result_df = apply_recode(
|
||
df,
|
||
request.column,
|
||
request.mapping,
|
||
request.create_new_column,
|
||
request.new_column_name,
|
||
request.na_handling, # ✨ NA处理方式
|
||
request.na_value # ✨ NA映射值
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
import numpy as np
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"重编码成功: {request.column}")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"重编码操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
})
|
||
|
||
|
||
@app.post("/api/operations/binning")
|
||
async def operation_binning(request: BinningRequest):
|
||
"""
|
||
生成分类变量(分箱)操作(预写函数)
|
||
|
||
Args:
|
||
request: BinningRequest
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用预写函数(传递NA处理参数)
|
||
result_df = apply_binning(
|
||
df,
|
||
request.column,
|
||
request.method,
|
||
request.new_column_name,
|
||
request.bins,
|
||
request.labels,
|
||
request.num_bins,
|
||
request.na_handling, # ✨ NA处理方式
|
||
request.na_label, # ✨ NA标签
|
||
request.na_assign_to # ✨ NA分配到的组索引
|
||
)
|
||
|
||
# 转换回JSON(处理Categorical类型、NaN值和inf值)
|
||
import numpy as np
|
||
|
||
# 1. 将Categorical列转为字符串
|
||
for col in result_df.columns:
|
||
if pd.api.types.is_categorical_dtype(result_df[col]):
|
||
result_df[col] = result_df[col].astype(str)
|
||
|
||
# 2. 替换inf和-inf为None
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
|
||
# 3. 将NaN替换为None(避免JSON序列化错误)
|
||
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
|
||
# 4. 转换为dict
|
||
result_data = result_df.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"分箱成功: {request.column} → {request.new_column_name}")
|
||
|
||
# 使用json.dumps手动序列化(确保NaN完全处理)
|
||
import json
|
||
response_content = {
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
}
|
||
|
||
# 手动序列化,NaN会被转为null
|
||
json_str = json.dumps(response_content, allow_nan=True)
|
||
# 替换NaN为null(防止任何遗漏的NaN)
|
||
json_str = json_str.replace('NaN', 'null').replace('Infinity', 'null').replace('-Infinity', 'null')
|
||
|
||
return JSONResponse(content=json.loads(json_str))
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"分箱操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
})
|
||
|
||
|
||
@app.post("/api/operations/conditional")
|
||
async def operation_conditional(request: ConditionalRequest):
|
||
"""
|
||
条件生成列操作(预写函数)
|
||
|
||
根据多条件IF-THEN-ELSE规则生成新列
|
||
|
||
Args:
|
||
request: ConditionalRequest
|
||
- data: 数据
|
||
- new_column_name: 新列名称
|
||
- rules: 规则列表,每个规则包含 conditions, logic, result
|
||
- else_value: 默认值
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用预写函数
|
||
result_df = apply_conditional_column(
|
||
df,
|
||
request.new_column_name,
|
||
request.rules,
|
||
request.else_value
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
import numpy as np
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"条件生成列成功: {request.new_column_name}")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"条件生成列操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/dropna")
|
||
async def operation_dropna(request: DropnaRequest):
|
||
"""
|
||
删除缺失值操作(预写函数)
|
||
|
||
Args:
|
||
request: DropnaRequest
|
||
- data: 数据
|
||
- method: 删除方式 ('row', 'column', 'both')
|
||
- threshold: 缺失率阈值(0-1)
|
||
- subset: 仅检查指定列(可选)
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用预写函数
|
||
result_df = drop_missing_values(
|
||
df,
|
||
method=request.method,
|
||
threshold=request.threshold,
|
||
subset=request.subset
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
import numpy as np
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"删除缺失值成功: {request.method}")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"删除缺失值操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/compute")
|
||
async def operation_compute(request: ComputeRequest):
|
||
"""
|
||
计算列操作(预写函数)
|
||
|
||
基于公式计算新列
|
||
|
||
Args:
|
||
request: ComputeRequest
|
||
- data: 数据
|
||
- new_column_name: 新列名称
|
||
- formula: 计算公式
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# ✨ 调用预写函数(传递column_mapping)
|
||
result_df = compute_column(
|
||
df,
|
||
request.new_column_name,
|
||
request.formula,
|
||
request.column_mapping # ✨ 传递列名映射
|
||
)
|
||
|
||
# 转换回JSON(处理NaN值和inf值)
|
||
import numpy as np
|
||
# 1. 替换inf和-inf为None
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
# 2. 替换NaN为None
|
||
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
# 3. 转换为dict(此时已经没有NaN和inf)
|
||
result_data = result_df.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"计算列成功: {request.new_column_name}")
|
||
|
||
# 使用json.dumps手动序列化(处理NaN)
|
||
import json
|
||
response_content = {
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
}
|
||
|
||
# 手动序列化,NaN会被转为null
|
||
json_str = json.dumps(response_content, allow_nan=True)
|
||
# 替换NaN为null
|
||
json_str = json_str.replace('NaN', 'null').replace('Infinity', 'null').replace('-Infinity', 'null')
|
||
|
||
return JSONResponse(content=json.loads(json_str))
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"计算列操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/pivot")
|
||
async def operation_pivot(request: PivotRequest):
|
||
"""
|
||
Pivot操作:长表转宽表(预写函数)
|
||
|
||
将纵向重复数据转为横向数据
|
||
|
||
Args:
|
||
request: PivotRequest
|
||
- data: 数据
|
||
- index_column: 索引列
|
||
- pivot_column: 透视列
|
||
- value_columns: 值列列表
|
||
- aggfunc: 聚合函数
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import numpy as np
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# ✨ 调用预写函数(传递column_mapping和未选择列处理参数)
|
||
result_df = pivot_long_to_wide(
|
||
df,
|
||
request.index_column,
|
||
request.pivot_column,
|
||
request.value_columns,
|
||
request.aggfunc,
|
||
request.column_mapping, # ✨ 传递列名映射
|
||
request.keep_unused_columns, # ✨ 是否保留未选择的列
|
||
request.unused_agg_method, # ✨ 未选择列的聚合方式
|
||
request.original_column_order, # ✨ 原始列顺序
|
||
request.pivot_value_order # ✨ 透视列值的原始顺序
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"Pivot操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/unpivot")
|
||
async def operation_unpivot(request: UnpivotRequest):
|
||
"""
|
||
Unpivot操作:宽表转长表(预写函数)
|
||
|
||
将横向数据转为纵向重复数据
|
||
|
||
典型医学场景:
|
||
- 多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列)
|
||
- 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列)
|
||
|
||
Args:
|
||
request: UnpivotRequest
|
||
- data: 数据
|
||
- id_vars: ID列(保持不变的列)
|
||
- value_vars: 值列(需要转换的列)
|
||
- var_name: 变量名列名(默认:"变量")
|
||
- value_name: 值列名(默认:"值")
|
||
- parse_column_names: 是否解析列名(默认:False)
|
||
- separator: 分隔符(默认:"_")
|
||
- metric_name: 指标列名(可选)
|
||
- time_name: 时间列名(可选)
|
||
- dropna: 是否删除缺失值行(默认:False)
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float,
|
||
"result_shape": [rows, cols]
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import numpy as np
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# ✨ 调用预写函数
|
||
result_df = apply_unpivot(
|
||
df,
|
||
request.id_vars,
|
||
request.value_vars,
|
||
request.var_name,
|
||
request.value_name,
|
||
request.parse_column_names,
|
||
request.separator,
|
||
request.metric_name,
|
||
request.time_name,
|
||
request.dropna
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)} 行")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"Unpivot操作失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/metric-time/detect")
|
||
async def operation_metric_time_detect(request: MetricTimeDetectRequest):
|
||
"""
|
||
检测指标-时间表转换模式
|
||
|
||
自动分析列名,检测:
|
||
- 公共前缀(指标名)
|
||
- 分隔符
|
||
- 时间点列表
|
||
- 置信度
|
||
|
||
Args:
|
||
request: MetricTimeDetectRequest
|
||
- value_vars: 值列列表
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"pattern": {
|
||
"common_prefix": str,
|
||
"separator": str,
|
||
"timepoints": List[str],
|
||
"confidence": float,
|
||
"message": str
|
||
}
|
||
}
|
||
"""
|
||
try:
|
||
import time
|
||
|
||
start_time = time.time()
|
||
|
||
logger.info(f"检测指标-时间表模式: {len(request.value_vars)} 列")
|
||
|
||
# 调用检测函数
|
||
pattern = detect_common_pattern(request.value_vars)
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}")
|
||
|
||
return JSONResponse(content={
|
||
"success": pattern['success'],
|
||
"pattern": pattern,
|
||
"execution_time": execution_time
|
||
})
|
||
|
||
except Exception as e:
|
||
logger.error(f"模式检测失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/metric-time")
|
||
async def operation_metric_time_transform(request: MetricTimeTransformRequest):
|
||
"""
|
||
指标-时间表转换操作(预写函数)
|
||
|
||
将多个时间点列转换为"指标行+时间点列"格式
|
||
|
||
典型场景:
|
||
- 制作临床研究Table 1
|
||
- 横向对比同一指标的时间变化
|
||
|
||
Args:
|
||
request: MetricTimeTransformRequest
|
||
- data: 数据
|
||
- id_vars: ID列(保持不变)
|
||
- value_vars: 值列(同一指标的多个时间点)
|
||
- metric_name: 指标名称(可选,自动检测)
|
||
- separator: 分隔符(可选,自动检测)
|
||
- timepoint_col_name: 时间点列名
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"output": str,
|
||
"execution_time": float,
|
||
"result_shape": [rows, cols]
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import numpy as np
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# ✨ 调用预写函数
|
||
result_df = apply_metric_time_transform(
|
||
df,
|
||
request.id_vars,
|
||
request.value_vars,
|
||
request.metric_name,
|
||
request.separator,
|
||
request.timepoint_col_name
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)} 列")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"指标-时间表转换失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
# ==================== 多指标转换API ====================
|
||
|
||
@app.post("/api/operations/multi-metric/detect")
|
||
async def operation_multi_metric_detect(request: MultiMetricDetectRequest):
|
||
"""
|
||
多指标自动分组检测
|
||
|
||
检测多个指标的列并自动分组
|
||
|
||
Args:
|
||
request: MultiMetricDetectRequest
|
||
- value_vars: 值列列表
|
||
- separators: 可选的分隔符列表
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"metric_groups": Dict[str, List[str]], # 指标分组
|
||
"separator": str, # 检测到的分隔符
|
||
"timepoints": List[str], # 时间点列表
|
||
"confidence": float, # 置信度
|
||
"message": str
|
||
}
|
||
"""
|
||
try:
|
||
result = detect_metric_groups(
|
||
request.value_vars,
|
||
request.separators
|
||
)
|
||
|
||
logger.info(f"多指标分组检测: {len(request.value_vars)} 列 → {len(result.get('metric_groups', {}))} 个指标")
|
||
|
||
return JSONResponse(content=result)
|
||
|
||
except Exception as e:
|
||
logger.error(f"多指标分组检测失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e)
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/multi-metric/to-long")
|
||
async def operation_multi_metric_to_long(request: MultiMetricToLongRequest):
|
||
"""
|
||
多指标转长表(时间点为行,指标为列)
|
||
|
||
将多个指标的宽表转换为长表格式,适合统计分析和可视化
|
||
|
||
典型场景:
|
||
- 纵向研究数据分析
|
||
- 重复测量数据准备
|
||
- 混合效应模型、GEE分析
|
||
- 数据可视化(ggplot2、seaborn)
|
||
|
||
Args:
|
||
request: MultiMetricToLongRequest
|
||
- data: 数据
|
||
- id_vars: ID列
|
||
- value_vars: 值列(多个指标的多个时间点)
|
||
- separators: 可选的分隔符列表
|
||
- event_col_name: 时间点列名
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"grouping": {...}, # 分组信息
|
||
"output": str,
|
||
"execution_time": float,
|
||
"result_shape": [rows, cols]
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import numpy as np
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 1. 先检测分组
|
||
grouping = detect_metric_groups(
|
||
request.value_vars,
|
||
request.separators
|
||
)
|
||
|
||
if not grouping['success']:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": grouping['message'],
|
||
"output": output
|
||
}, status_code=400)
|
||
|
||
# 2. 执行转换
|
||
result_df = apply_multi_metric_to_long(
|
||
df,
|
||
request.id_vars,
|
||
grouping['metric_groups'],
|
||
grouping['separator'],
|
||
request.event_col_name
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"grouping": grouping,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"多指标转长表失败: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/multi-metric/to-matrix")
|
||
async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest):
|
||
"""
|
||
多指标转矩阵(时间点为列,指标为行)
|
||
|
||
将多个指标的宽表转换为矩阵格式,适合临床报告和数据审查
|
||
|
||
典型场景:
|
||
- 临床研究报告
|
||
- 数据审查表
|
||
- CRF核对
|
||
- 单受试者数据审查
|
||
|
||
Args:
|
||
request: MultiMetricToMatrixRequest
|
||
- data: 数据
|
||
- id_vars: ID列
|
||
- value_vars: 值列(多个指标的多个时间点)
|
||
- separators: 可选的分隔符列表
|
||
- metric_col_name: 指标列名
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"grouping": {...}, # 分组信息
|
||
"output": str,
|
||
"execution_time": float,
|
||
"result_shape": [rows, cols]
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import numpy as np
|
||
import time
|
||
import io
|
||
import sys
|
||
|
||
start_time = time.time()
|
||
|
||
# 捕获打印输出
|
||
captured_output = io.StringIO()
|
||
sys.stdout = captured_output
|
||
|
||
try:
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 1. 先检测分组
|
||
grouping = detect_metric_groups(
|
||
request.value_vars,
|
||
request.separators
|
||
)
|
||
|
||
if not grouping['success']:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": grouping['message'],
|
||
"output": output
|
||
}, status_code=400)
|
||
|
||
# 2. 执行转换
|
||
result_df = apply_multi_metric_to_matrix(
|
||
df,
|
||
request.id_vars,
|
||
grouping['metric_groups'],
|
||
grouping['separator'],
|
||
'Event_Name',
|
||
request.metric_col_name
|
||
)
|
||
|
||
# 转换回JSON(处理NaN和inf值)
|
||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||
result_data = result_df_clean.to_dict('records')
|
||
|
||
# 恢复stdout
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"result_data": result_data,
|
||
"grouping": grouping,
|
||
"output": output,
|
||
"execution_time": execution_time,
|
||
"result_shape": [len(result_data), len(result_df.columns)]
|
||
})
|
||
|
||
except Exception as e:
|
||
sys.stdout = sys.__stdout__
|
||
output = captured_output.getvalue()
|
||
raise e
|
||
|
||
except Exception as e:
|
||
logger.error(f"多指标转矩阵失败: {str(e)}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/fillna-stats")
|
||
async def operation_fillna_stats(request: FillnaStatsRequest):
|
||
"""
|
||
获取列的缺失值统计信息
|
||
|
||
Args:
|
||
request: FillnaStatsRequest
|
||
- data: 数据
|
||
- column: 列名
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"stats": Dict (缺失值统计信息),
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
|
||
start_time = time.time()
|
||
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用统计函数
|
||
stats = get_column_missing_stats(df, request.column)
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"获取列 '{request.column}' 的缺失值统计成功")
|
||
|
||
return JSONResponse(content={
|
||
"success": True,
|
||
"stats": stats,
|
||
"execution_time": execution_time
|
||
})
|
||
|
||
except Exception as e:
|
||
logger.error(f"获取缺失值统计失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/fillna-simple")
|
||
async def operation_fillna_simple(request: FillnaSimpleRequest):
|
||
"""
|
||
简单填补缺失值(均值、中位数、众数、固定值、前向、后向)
|
||
|
||
Args:
|
||
request: FillnaSimpleRequest
|
||
- data: 数据
|
||
- column: 原始列名
|
||
- new_column_name: 新列名
|
||
- method: 填补方法
|
||
- fill_value: 固定值(method='constant'时使用)
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"stats": Dict (填补统计信息),
|
||
"message": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
|
||
start_time = time.time()
|
||
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用填补函数
|
||
result = fillna_simple(
|
||
df,
|
||
request.column,
|
||
request.new_column_name,
|
||
request.method,
|
||
request.fill_value
|
||
)
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"简单填补成功: {request.method} on '{request.column}'")
|
||
|
||
return JSONResponse(content={
|
||
"success": result['success'],
|
||
"result_data": result['result_data'],
|
||
"stats": result['stats'],
|
||
"message": result['message'],
|
||
"execution_time": execution_time
|
||
})
|
||
|
||
except Exception as e:
|
||
logger.error(f"简单填补失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/fillna-mice")
|
||
async def operation_fillna_mice(request: FillnaMiceRequest):
|
||
"""
|
||
MICE多重插补
|
||
|
||
Args:
|
||
request: FillnaMiceRequest
|
||
- data: 数据
|
||
- columns: 要填补的列名列表
|
||
- n_iterations: 迭代次数
|
||
- random_state: 随机种子
|
||
|
||
Returns:
|
||
{
|
||
"success": bool,
|
||
"result_data": List[Dict],
|
||
"stats": Dict (各列的填补统计信息),
|
||
"message": str,
|
||
"execution_time": float
|
||
}
|
||
"""
|
||
try:
|
||
import pandas as pd
|
||
import time
|
||
|
||
start_time = time.time()
|
||
|
||
# 转换为DataFrame
|
||
df = pd.DataFrame(request.data)
|
||
|
||
# 调用MICE填补函数
|
||
result = fillna_mice(
|
||
df,
|
||
request.columns,
|
||
request.reference_columns, # ⭐ 新增:传递参考列
|
||
request.n_iterations,
|
||
request.random_state
|
||
)
|
||
|
||
execution_time = time.time() - start_time
|
||
|
||
logger.info(f"MICE填补成功: {len(request.columns)} 列")
|
||
|
||
return JSONResponse(content={
|
||
"success": result['success'],
|
||
"result_data": result['result_data'],
|
||
"stats": result['stats'],
|
||
"message": result['message'],
|
||
"execution_time": execution_time
|
||
})
|
||
|
||
except Exception as e:
|
||
logger.error(f"MICE填补失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"success": False,
|
||
"error": str(e),
|
||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||
}, status_code=400)
|
||
|
||
|
||
# ==================== Word 导出 API ====================
|
||
|
||
@app.get("/api/pandoc/status")
|
||
async def pandoc_status():
|
||
"""
|
||
检查 Pandoc 可用性
|
||
|
||
Returns:
|
||
{
|
||
"available": bool,
|
||
"version": str,
|
||
"message": str
|
||
}
|
||
"""
|
||
try:
|
||
result = check_pandoc_available()
|
||
logger.info(f"Pandoc 状态检查: {result}")
|
||
return JSONResponse(content=result)
|
||
except Exception as e:
|
||
logger.error(f"Pandoc 状态检查失败: {str(e)}")
|
||
return JSONResponse(content={
|
||
"available": False,
|
||
"version": None,
|
||
"message": f"检查失败: {str(e)}"
|
||
})
|
||
|
||
|
||
@app.post("/api/convert/docx")
|
||
async def convert_to_docx(request: MarkdownToDocxRequest):
|
||
"""
|
||
Markdown 转 Word 接口
|
||
|
||
将 Markdown 文本转换为 Word 文档(.docx)
|
||
|
||
Args:
|
||
request: MarkdownToDocxRequest
|
||
- content: Markdown 内容
|
||
- use_template: 是否使用模板(默认 True)
|
||
- title: 文档标题
|
||
|
||
Returns:
|
||
Word 文档二进制数据(application/vnd.openxmlformats-officedocument.wordprocessingml.document)
|
||
"""
|
||
try:
|
||
logger.info(f"开始转换 Markdown → Word, 内容长度: {len(request.content)} 字符")
|
||
|
||
# 执行转换
|
||
result = convert_markdown_to_docx(
|
||
markdown_text=request.content,
|
||
use_template=request.use_template
|
||
)
|
||
|
||
if not result["success"]:
|
||
logger.error(f"转换失败: {result.get('error', 'Unknown error')}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=result.get("error", "转换失败")
|
||
)
|
||
|
||
# 读取生成的文件
|
||
output_path = result["output_path"]
|
||
with open(output_path, 'rb') as f:
|
||
content = f.read()
|
||
|
||
# 清理临时文件
|
||
try:
|
||
os.remove(output_path)
|
||
except Exception as e:
|
||
logger.warning(f"清理临时文件失败: {e}")
|
||
|
||
logger.info(f"Markdown → Word 转换成功, 文件大小: {len(content)} bytes")
|
||
|
||
# 返回文件
|
||
return Response(
|
||
content=content,
|
||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||
headers={
|
||
"Content-Disposition": f'attachment; filename="document.docx"'
|
||
}
|
||
)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"Markdown → Word 转换失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"转换失败: {str(e)}"
|
||
)
|
||
|
||
|
||
@app.post("/api/protocol/export/docx")
|
||
async def export_protocol_to_docx(request: ProtocolToDocxRequest):
|
||
"""
|
||
研究方案导出为 Word 接口
|
||
|
||
将分章节的研究方案内容导出为格式化的 Word 文档
|
||
|
||
Args:
|
||
request: ProtocolToDocxRequest
|
||
- sections: 章节内容字典
|
||
- title: 文档标题
|
||
|
||
Returns:
|
||
Word 文档二进制数据
|
||
"""
|
||
try:
|
||
logger.info(f"开始导出研究方案, 章节数: {len(request.sections)}")
|
||
|
||
# 执行转换
|
||
result = create_protocol_docx(
|
||
sections=request.sections,
|
||
title=request.title
|
||
)
|
||
|
||
if not result["success"]:
|
||
logger.error(f"导出失败: {result.get('error', 'Unknown error')}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=result.get("error", "导出失败")
|
||
)
|
||
|
||
# 读取生成的文件
|
||
output_path = result["output_path"]
|
||
with open(output_path, 'rb') as f:
|
||
content = f.read()
|
||
|
||
# 清理临时文件
|
||
try:
|
||
os.remove(output_path)
|
||
except Exception as e:
|
||
logger.warning(f"清理临时文件失败: {e}")
|
||
|
||
logger.info(f"研究方案导出成功, 文件大小: {len(content)} bytes")
|
||
|
||
# 返回文件
|
||
return Response(
|
||
content=content,
|
||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||
headers={
|
||
"Content-Disposition": f'attachment; filename="research_protocol.docx"'
|
||
}
|
||
)
|
||
|
||
except HTTPException:
|
||
raise
|
||
except Exception as e:
|
||
logger.error(f"研究方案导出失败: {str(e)}")
|
||
raise HTTPException(
|
||
status_code=500,
|
||
detail=f"导出失败: {str(e)}"
|
||
)
|
||
|
||
|
||
# ==================== 启动配置 ====================
|
||
|
||
if __name__ == "__main__":
|
||
import uvicorn
|
||
|
||
port = int(os.getenv("SERVICE_PORT", 8000))
|
||
host = os.getenv("SERVICE_HOST", "0.0.0.0")
|
||
debug = os.getenv("DEBUG", "True").lower() == "true"
|
||
|
||
logger.info(f"启动文档提取微服务...")
|
||
logger.info(f"地址: http://{host}:{port}")
|
||
logger.info(f"健康检查: http://{host}:{port}/api/health")
|
||
logger.info(f"调试模式: {debug}")
|
||
|
||
uvicorn.run(
|
||
"main:app",
|
||
host=host,
|
||
port=port,
|
||
reload=debug,
|
||
log_level="info"
|
||
)
|
||
|