Critical fixes: 1. Compute column: Add Chinese comma support in formula validation - Problem: Formula with Chinese comma failed validation - Fix: Add Chinese comma character to allowed_chars regex - Example: Support formulas like 'col1(kg)+ col2,col3' 2. Binning operation: Fix NaN serialization error - Problem: 'Out of range float values are not JSON compliant: nan' - Fix: Enhanced NaN/inf handling in binning endpoint - Added np.inf/-np.inf replacement before JSON serialization - Added manual JSON serialization with NaN->null conversion 3. Enhanced all operation endpoints for consistency - Updated conditional, dropna endpoints with same NaN/inf handling - Ensures all operations return JSON-compliant data Modified files: - extraction_service/operations/compute.py: Add Chinese comma to regex - extraction_service/main.py: Enhanced NaN handling in binning/conditional/dropna Status: Hotfix complete, ready for testing
1270 lines
37 KiB
Python
1270 lines
37 KiB
Python
"""
|
||
文档提取微服务 - 主入口
|
||
|
||
功能:
|
||
- PDF文本提取(PyMuPDF)
|
||
- Docx文本提取(Mammoth)
|
||
- Txt文本提取(直接读取)
|
||
- 语言检测
|
||
- 健康检查
|
||
"""
|
||
|
||
from fastapi import FastAPI, File, UploadFile, HTTPException
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from fastapi.responses import JSONResponse
|
||
from pydantic import BaseModel
|
||
from typing import List, Dict, Any, Optional
|
||
from loguru import logger
|
||
from pathlib import Path
|
||
import os
|
||
import sys
|
||
from datetime import datetime
|
||
from dotenv import load_dotenv
|
||
|
||
# 加载环境变量
|
||
load_dotenv()
|
||
|
||
# Configure logging: drop loguru's default sink and log to stdout with a
# colorized format; level is configurable via the LOG_LEVEL env var.
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
    level=os.getenv("LOG_LEVEL", "INFO")
)

# Create the FastAPI application
app = FastAPI(
    title="文档提取微服务",
    description="提供PDF、Docx、Txt文档的文本提取服务",
    version="1.0.0",
)

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # production deployments should restrict to specific domains
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Temporary-file directory (configurable via the TEMP_DIR env var)
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
||
|
||
# 导入服务模块
|
||
from services.pdf_extractor import extract_pdf_pymupdf
|
||
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
|
||
from services.language_detector import detect_language, detect_language_detailed
|
||
from services.nougat_extractor import check_nougat_available, get_nougat_info
|
||
from services.file_utils import detect_file_type, cleanup_temp_file
|
||
from services.docx_extractor import extract_docx_mammoth, validate_docx_file
|
||
from services.txt_extractor import extract_txt, validate_txt_file
|
||
from services.dc_executor import validate_code, execute_pandas_code
|
||
|
||
# ✨ 导入预写的数据操作函数
|
||
from operations.filter import apply_filter
|
||
from operations.recode import apply_recode
|
||
from operations.binning import apply_binning
|
||
from operations.conditional import apply_conditional_column, apply_simple_binning
|
||
from operations.dropna import drop_missing_values, get_missing_summary
|
||
from operations.compute import compute_column, get_formula_examples
|
||
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||
|
||
|
||
# ==================== Pydantic Models ====================
|
||
|
||
class ValidateCodeRequest(BaseModel):
    """Request body for pandas-code validation."""
    code: str  # pandas code to validate


class ExecuteCodeRequest(BaseModel):
    """Request body for pandas-code execution."""
    data: List[Dict[str, Any]]  # rows as a list of records
    code: str                   # pandas code operating on the `df` variable


# Request models for the pre-written operation endpoints.
# Fields that default to None are annotated Optional[...] — the previous
# bare `str = None` style is an implicit Optional, which pydantic v2 rejects.
class FilterRequest(BaseModel):
    """Request body for the advanced-filter operation."""
    data: List[Dict[str, Any]]
    conditions: List[Dict[str, Any]]
    logic: str = 'and'  # 'and' or 'or'


class RecodeRequest(BaseModel):
    """Request body for the value-recoding operation."""
    data: List[Dict[str, Any]]
    column: str
    mapping: Dict[Any, Any]
    create_new_column: bool = True
    new_column_name: Optional[str] = None


class BinningRequest(BaseModel):
    """Request body for the binning operation."""
    data: List[Dict[str, Any]]
    column: str
    method: str
    new_column_name: str
    bins: Optional[List[Any]] = None
    labels: Optional[List[Any]] = None
    num_bins: int = 3


class ConditionalRequest(BaseModel):
    """Request body for the conditional-column operation."""
    data: List[Dict[str, Any]]
    new_column_name: str
    rules: List[Dict[str, Any]]
    else_value: Any = None


class DropnaRequest(BaseModel):
    """Request body for the drop-missing-values operation."""
    data: List[Dict[str, Any]]
    method: str  # 'row', 'column', 'both'
    threshold: float = 0.5
    subset: Optional[List[str]] = None


class ComputeRequest(BaseModel):
    """Request body for the computed-column operation."""
    data: List[Dict[str, Any]]
    new_column_name: str
    formula: str


class PivotRequest(BaseModel):
    """Request body for the long-to-wide pivot operation."""
    data: List[Dict[str, Any]]
    index_column: str
    pivot_column: str
    value_columns: List[str]
    aggfunc: str = 'first'
|
||
|
||
# ==================== API路由 ====================
|
||
|
||
@app.get("/")
async def root():
    """Landing endpoint: report service identity, version, and status."""
    info = {
        "service": "文档提取微服务",
        "version": "1.0.0",
        "status": "running",
    }
    return info
||
|
||
|
||
@app.get("/api/health")
async def health_check():
    """
    Health-check endpoint.

    Checks:
    - service is running
    - PyMuPDF is importable (and reports its version)
    - Nougat availability (via get_nougat_info)
    - temp directory exists and is writable
    """
    try:
        import fitz  # PyMuPDF
        pymupdf_version = fitz.__version__
        pymupdf_available = True
    except Exception as e:
        pymupdf_version = "unknown"
        pymupdf_available = False
        logger.warning(f"PyMuPDF不可用: {str(e)}")

    # Nougat availability/details
    nougat_info = get_nougat_info()

    # The temp directory must exist and be writable for extraction to work
    temp_dir_writable = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)

    return {
        # "degraded" when either PyMuPDF or the temp dir is unavailable;
        # Nougat is optional and does not affect the overall status.
        "status": "healthy" if (pymupdf_available and temp_dir_writable) else "degraded",
        "checks": {
            "pymupdf": {
                "available": pymupdf_available,
                "version": pymupdf_version
            },
            "nougat": nougat_info,
            "temp_dir": {
                "path": str(TEMP_DIR),
                "writable": temp_dir_writable
            }
        },
        "timestamp": datetime.now().isoformat()
    }
|
||
|
||
|
||
@app.post("/api/extract/pdf")
async def extract_pdf_endpoint(
    file: UploadFile = File(...),
    method: str = "auto"
):
    """
    PDF text-extraction endpoint (method chosen intelligently).

    Args:
        file: uploaded PDF file
        method: extraction method ('auto' | 'nougat' | 'pymupdf')
            - auto: choose automatically (default)
            - nougat: force Nougat
            - pymupdf: force PyMuPDF

    Returns:
        {
            "success": true,
            "method": "nougat" | "pymupdf",
            "reason": "...",
            "text": extracted text,
            "metadata": {...}
        }
    """
    temp_path = None

    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误,只支持PDF文件"
            )

        # Save the upload to a temp file.
        # NOTE(review): pid alone does not make the name unique across
        # concurrent requests in the same worker — consider a uuid suffix.
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"

        logger.info(f"开始处理PDF文件: {file.filename}, 方法={method}")

        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)

        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Extract text using the sequential-fallback strategy
        force_method = None if method == "auto" else method
        result = extract_pdf(str(temp_path), force_method=force_method)

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
            )

        # Attach file metadata to the result
        result["metadata"]["file_size"] = file_size
        result["metadata"]["filename"] = file.filename

        logger.info(f"PDF提取成功: {file.filename}, "
                    f"方法={result['method']}, "
                    f"原因={result.get('reason', 'N/A')}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"PDF提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        # Always remove the temp file
        if temp_path:
            cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/detect-language")
async def detect_language_endpoint(file: UploadFile = File(...)):
    """
    PDF language-detection endpoint.

    Args:
        file: uploaded PDF file

    Returns:
        {
            "language": "chinese" | "english" | "mixed",
            "chinese_ratio": 0.65,
            "chinese_chars": 3500,
            "total_chars": 5000
        }
    """
    temp_path = None

    try:
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")

        # Save the upload to a temp file
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"

        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)

        # Run the language detector on the saved file
        result = detect_language_detailed(str(temp_path))
        result["filename"] = file.filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/pdf-strategy")
async def get_strategy_endpoint(file: UploadFile = File(...)):
    """
    Return the recommended PDF processing strategy (no extraction is run).

    Args:
        file: uploaded PDF file

    Returns:
        {
            "detected_language": "chinese" | "english",
            "recommended_method": "nougat" | "pymupdf",
            "reason": "...",
            "nougat_available": true
        }
    """
    temp_path = None

    try:
        if not file.filename.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")

        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"

        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)

        # Decide which extraction method to recommend for this file
        result = get_pdf_processing_strategy(str(temp_path))
        result["filename"] = file.filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"获取策略失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract/docx")
async def extract_docx_endpoint(file: UploadFile = File(...)):
    """
    Docx extraction endpoint (Mammoth).

    Args:
        file: uploaded .docx file

    Returns:
        {
            "success": true,
            "method": "mammoth",
            "text": extracted text,
            "metadata": {
                "char_count": number of characters,
                "has_tables": whether tables are present,
                "file_size": file size
            }
        }
    """
    temp_path = None

    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.docx'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误,只支持Docx文件"
            )

        # Save the upload to a temp file
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"

        logger.info(f"开始处理Docx文件: {file.filename}")

        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)

        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Extract text via Mammoth
        result = extract_docx_mammoth(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
            )

        # Attach method and filename to the result
        result["method"] = "mammoth"
        result["metadata"]["filename"] = file.filename

        logger.info(f"Docx提取成功: {file.filename}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract/txt")
async def extract_txt_endpoint(file: UploadFile = File(...)):
    """
    Txt extraction endpoint (direct read with encoding detection).

    Args:
        file: uploaded .txt file

    Returns:
        {
            "success": true,
            "method": "direct",
            "text": file content,
            "encoding": "utf-8",
            "metadata": {
                "char_count": number of characters,
                "line_count": number of lines,
                "file_size": file size
            }
        }
    """
    temp_path = None

    try:
        # Validate the file type by extension
        if not file.filename.lower().endswith('.txt'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误,只支持Txt文件"
            )

        # Save the upload to a temp file
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{file.filename}"

        logger.info(f"开始处理Txt文件: {file.filename}")

        with open(temp_path, "wb") as f:
            content = await file.read()
            f.write(content)

        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Extract text
        result = extract_txt(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
            )

        # Attach method and filename to the result
        result["method"] = "direct"
        result["metadata"]["filename"] = file.filename

        logger.info(f"Txt提取成功: {file.filename}, "
                    f"编码={result['encoding']}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Txt提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
|
||
|
||
|
||
@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    file_type: str = None
):
    """
    Generic document-extraction endpoint.

    Detects the file type (unless explicitly given) and dispatches to the
    matching type-specific endpoint.

    Args:
        file: uploaded file
        file_type: optional explicit type ('pdf' | 'docx' | 'txt')

    Returns:
        the extraction result from the dispatched endpoint
    """
    try:
        # Auto-detect the type from the filename when not specified
        if not file_type:
            file_type = detect_file_type(file.filename)

        logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")

        # Dispatch to the type-specific handler
        if file_type == 'pdf':
            return await extract_pdf_endpoint(file)
        elif file_type == 'docx':
            return await extract_docx_endpoint(file)
        elif file_type == 'txt':
            return await extract_txt_endpoint(file)
        else:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件格式: {file_type},仅支持PDF、Docx、Txt"
            )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
|
||
|
||
|
||
# ==================== DC工具C - 代码执行接口 ====================
|
||
|
||
@app.post("/api/dc/validate")
async def validate_pandas_code(request: ValidateCodeRequest):
    """
    DC tool C - pandas-code safety-validation endpoint.

    Args:
        request: ValidateCodeRequest
            - code: pandas code to validate

    Returns:
        {
            "valid": bool,
            "errors": List[str],
            "warnings": List[str]
        }
    """
    try:
        logger.info(f"开始验证Pandas代码,长度: {len(request.code)} 字符")

        # AST-based safety check
        result = validate_code(request.code)

        logger.info(
            f"代码验证完成: valid={result['valid']}, "
            f"errors={len(result['errors'])}, warnings={len(result['warnings'])}"
        )

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"代码验证失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"验证失败: {str(e)}"
        )
|
||
|
||
|
||
@app.post("/api/dc/execute")
async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
    """
    DC tool C - pandas-code execution endpoint.

    Args:
        request: ExecuteCodeRequest
            - data: List[Dict]  # rows as JSON records
            - code: str         # pandas code operating on `df`

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],  # data after execution
            "output": str,              # captured prints
            "error": str,               # error message on failure
            "execution_time": float,    # seconds
            "result_shape": [rows, cols]
        }
    """
    try:
        logger.info(
            f"开始执行Pandas代码: "
            f"数据行数={len(request.data)}, 代码长度={len(request.code)} 字符"
        )

        # Run the (pre-validated) code in the sandboxed executor
        result = execute_pandas_code(request.data, request.code)

        if result["success"]:
            logger.info(
                f"代码执行成功: "
                f"结果shape={result.get('result_shape')}, "
                f"耗时={result['execution_time']:.3f}秒"
            )
        else:
            # Execution failure is reported in-band, not as an HTTP error
            logger.warning(
                f"代码执行失败: {result.get('error', 'Unknown error')}"
            )

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"代码执行接口失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
|
||
|
||
|
||
# ==================== ✨ 预写函数API端点 ====================
|
||
|
||
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
    """
    Advanced-filter operation (pre-written function).

    Args:
        request: FilterRequest
            - data: input rows
            - conditions: filter conditions
            - logic: 'and' or 'or'

    Returns:
        {"success": True, "result_data", "output", "execution_time",
         "result_shape": [rows, cols]} on success, or
        {"success": False, "error", "execution_time"} on failure.
    """
    # Imports are hoisted above the try-block so the error path can always
    # reference `time`/`start_time` (previously they were bound inside the
    # try, so an early failure raised NameError instead of the error payload).
    import io
    import sys
    import time

    import numpy as np
    import pandas as pd

    start_time = time.time()

    # Capture anything the operation prints.
    # NOTE(review): swapping sys.stdout is process-global and not safe under
    # concurrent requests — confirm single-request-at-a-time deployment.
    captured_output = io.StringIO()
    sys.stdout = captured_output

    try:
        try:
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written filter implementation
            result_df = apply_filter(df, request.conditions, request.logic)

            # Make the frame JSON-safe: inf/-inf and NaN become None/null
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')
        finally:
            # Always restore stdout, even when the operation raises
            sys.stdout = sys.__stdout__

        output = captured_output.getvalue()
        execution_time = time.time() - start_time

        logger.info(f"筛选成功: {len(request.data)} → {len(result_data)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"筛选操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
|
||
|
||
|
||
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
    """
    Value-mapping (recode) operation (pre-written function).

    Args:
        request: RecodeRequest

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float
        }
    """
    try:
        import pandas as pd
        import time
        import io
        import sys

        start_time = time.time()

        # Capture anything the operation prints.
        # NOTE(review): swapping sys.stdout is process-global and not safe
        # under concurrent requests — confirm deployment model.
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            # Convert incoming records to a DataFrame
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written recode implementation
            result_df = apply_recode(
                df,
                request.column,
                request.mapping,
                request.create_new_column,
                request.new_column_name
            )

            # Back to JSON-safe records (inf/-inf and NaN -> None)
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"重编码成功: {request.column}")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before propagating to the outer handler
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"重编码操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        })
|
||
|
||
|
||
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
    """
    Binning (categorical-variable generation) operation (pre-written function).

    Args:
        request: BinningRequest

    Returns:
        {"success": True, "result_data", "output", "execution_time",
         "result_shape": [rows, cols]} on success, or
        {"success": False, "error", "execution_time"} on failure.
    """
    # Hoisted above the try-block so the error path can always use `time`.
    import io
    import json
    import sys
    import time

    import numpy as np
    import pandas as pd

    start_time = time.time()

    # Capture anything the operation prints.
    # NOTE(review): swapping sys.stdout is process-global and not safe under
    # concurrent requests — confirm deployment model.
    captured_output = io.StringIO()
    sys.stdout = captured_output

    try:
        try:
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written binning implementation
            result_df = apply_binning(
                df,
                request.column,
                request.method,
                request.new_column_name,
                request.bins,
                request.labels,
                request.num_bins
            )

            # 1. Categorical columns (e.g. pd.cut output) are not JSON
            #    serializable — stringify them.
            #    (isinstance on the dtype replaces the deprecated
            #    pd.api.types.is_categorical_dtype.)
            for col in result_df.columns:
                if isinstance(result_df[col].dtype, pd.CategoricalDtype):
                    result_df[col] = result_df[col].astype(str)

            # 2. Replace inf/-inf, then NaN, with None so the payload is
            #    JSON-compliant.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})

            result_data = result_df.to_dict('records')
        finally:
            # Always restore stdout, even when the operation raises
            sys.stdout = sys.__stdout__

        output = captured_output.getvalue()
        execution_time = time.time() - start_time

        logger.info(f"分箱成功: {request.column} → {request.new_column_name}")

        response_content = {
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        }

        # Defensive round-trip: any NaN/Infinity/-Infinity constant that
        # slipped through is mapped to null via parse_constant. (The previous
        # str.replace-based scrub corrupted user strings containing "NaN" and
        # turned "-Infinity" into invalid "-null" because "Infinity" was
        # replaced first.)
        json_str = json.dumps(response_content, allow_nan=True)
        return JSONResponse(content=json.loads(json_str, parse_constant=lambda _: None))

    except Exception as e:
        logger.error(f"分箱操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
|
||
|
||
|
||
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
    """
    Conditional-column operation (pre-written function).

    Generates a new column from multi-condition IF-THEN-ELSE rules.

    Args:
        request: ConditionalRequest
            - data: input rows
            - new_column_name: name of the generated column
            - rules: rule list; each rule has conditions, logic, result
            - else_value: default value when no rule matches

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float
        }
    """
    try:
        import pandas as pd
        import time
        import io
        import sys

        start_time = time.time()

        # Capture anything the operation prints.
        # NOTE(review): swapping sys.stdout is process-global and not safe
        # under concurrent requests — confirm deployment model.
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            # Convert incoming records to a DataFrame
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written implementation
            result_df = apply_conditional_column(
                df,
                request.new_column_name,
                request.rules,
                request.else_value
            )

            # Back to JSON-safe records (inf/-inf and NaN -> None)
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')

            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"条件生成列成功: {request.new_column_name}")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before propagating to the outer handler
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"条件生成列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
    """
    Drop-missing-values operation (pre-written function).

    Args:
        request: DropnaRequest
            - data: input rows
            - method: drop mode ('row', 'column', 'both')
            - threshold: missing-rate threshold (0-1)
            - subset: restrict the check to these columns (optional)

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float
        }
    """
    try:
        import pandas as pd
        import time
        import io
        import sys

        start_time = time.time()

        # Capture anything the operation prints.
        # NOTE(review): swapping sys.stdout is process-global and not safe
        # under concurrent requests — confirm deployment model.
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            # Convert incoming records to a DataFrame
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written implementation
            result_df = drop_missing_values(
                df,
                method=request.method,
                threshold=request.threshold,
                subset=request.subset
            )

            # Back to JSON-safe records (inf/-inf and NaN -> None)
            import numpy as np
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')

            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"删除缺失值成功: {request.method}")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before propagating to the outer handler
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"删除缺失值操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
    """
    Computed-column operation (pre-written function).

    Evaluates a formula to produce a new column.

    Args:
        request: ComputeRequest
            - data: input rows
            - new_column_name: name of the new column
            - formula: formula string to evaluate

    Returns:
        {"success": True, "result_data", "output", "execution_time",
         "result_shape": [rows, cols]} on success, or
        {"success": False, "error", "execution_time"} with HTTP 400 on failure.
    """
    # Hoisted above the try-block so the error path can always use `time`.
    import io
    import json
    import sys
    import time

    import numpy as np
    import pandas as pd

    start_time = time.time()

    # Capture anything the operation prints.
    # NOTE(review): swapping sys.stdout is process-global and not safe under
    # concurrent requests — confirm deployment model.
    captured_output = io.StringIO()
    sys.stdout = captured_output

    try:
        try:
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written compute implementation
            result_df = compute_column(
                df,
                request.new_column_name,
                request.formula
            )

            # Make the frame JSON-safe: inf/-inf, then NaN, become None
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')
        finally:
            # Always restore stdout, even when the operation raises
            sys.stdout = sys.__stdout__

        output = captured_output.getvalue()
        execution_time = time.time() - start_time

        logger.info(f"计算列成功: {request.new_column_name}")

        response_content = {
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        }

        # Defensive round-trip: residual NaN/Infinity/-Infinity constants are
        # mapped to null via parse_constant. (The previous str.replace-based
        # scrub corrupted user strings containing "NaN" and broke "-Infinity"
        # by replacing "Infinity" first.)
        json_str = json.dumps(response_content, allow_nan=True)
        return JSONResponse(content=json.loads(json_str, parse_constant=lambda _: None))

    except Exception as e:
        logger.error(f"计算列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||
|
||
|
||
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
    """
    Pivot operation: long-to-wide reshaping (pre-written function).

    Turns vertically repeated rows into columns.

    Args:
        request: PivotRequest
            - data: input rows
            - index_column: index column
            - pivot_column: column whose values become new columns
            - value_columns: value columns
            - aggfunc: aggregation function

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float
        }
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys

        start_time = time.time()

        # Capture anything the operation prints.
        # NOTE(review): swapping sys.stdout is process-global and not safe
        # under concurrent requests — confirm deployment model.
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            # Convert incoming records to a DataFrame
            df = pd.DataFrame(request.data)

            # Delegate to the pre-written implementation
            result_df = pivot_long_to_wide(
                df,
                request.index_column,
                request.pivot_column,
                request.value_columns,
                request.aggfunc
            )

            # Back to JSON-safe records (inf/-inf and NaN -> None)
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

            # Restore stdout before building the response
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before propagating to the outer handler
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"Pivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||
|
||
|
||
# ==================== Startup configuration ====================

if __name__ == "__main__":
    import uvicorn

    # Runtime settings come from the environment, with sensible defaults
    service_port = int(os.getenv("SERVICE_PORT", 8000))
    service_host = os.getenv("SERVICE_HOST", "0.0.0.0")
    debug_mode = os.getenv("DEBUG", "True").lower() == "true"

    logger.info(f"启动文档提取微服务...")
    logger.info(f"地址: http://{service_host}:{service_port}")
    logger.info(f"健康检查: http://{service_host}:{service_port}/api/health")
    logger.info(f"调试模式: {debug_mode}")

    # Debug mode enables uvicorn's auto-reload
    uvicorn.run(
        "main:app",
        host=service_host,
        port=service_port,
        reload=debug_mode,
        log_level="info",
    )
|
||
|