feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions
Summary:
- Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot)
- Refactor to pre-written Python functions architecture (stable and secure)
- Add 7 Python operations modules with full type hints
- Add 7 frontend Dialog components with user-friendly UI
- Fix NaN serialization issues and auto type conversion
- Update all related documentation

Technical Details:
- Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py)
- Backend: QuickActionService.ts with 7 execute methods
- Frontend: 7 Dialog components with complete validation
- Toolbar: Enable 7 quick action buttons

Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
@@ -62,6 +62,15 @@ from services.docx_extractor import extract_docx_mammoth, validate_docx_file
|
||||
from typing import Optional

from services.txt_extractor import extract_txt, validate_txt_file
from services.dc_executor import validate_code, execute_pandas_code

# ✨ 导入预写的数据操作函数
from operations.filter import apply_filter
from operations.recode import apply_recode
from operations.binning import apply_binning
from operations.conditional import apply_conditional_column, apply_simple_binning
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||||
|
||||
|
||||
# ==================== Pydantic Models ====================
|
||||
|
||||
@@ -74,6 +83,59 @@ class ExecuteCodeRequest(BaseModel):
|
||||
data: List[Dict[str, Any]]
|
||||
code: str
|
||||
|
||||
# ✨ 预写函数请求模型
|
||||
class FilterRequest(BaseModel):
    """Request model for the advanced filter operation."""
    # Input rows, one dict per record (column name -> value).
    data: List[Dict[str, Any]]
    # Filter conditions; each item presumably holds column/operator/value —
    # TODO confirm against apply_filter's contract.
    conditions: List[Dict[str, Any]]
    # How conditions combine: 'and' (all must match) or 'or' (any matches).
    logic: str = 'and'
|
||||
|
||||
class RecodeRequest(BaseModel):
    """Request model for the value-recoding operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Column whose values are recoded.
    column: str
    # old value -> new value mapping.
    mapping: Dict[Any, Any]
    # When True, write results to a new column instead of overwriting.
    create_new_column: bool = True
    # Optional target column name; was `str = None`, which is an invalid
    # annotation (rejected as a required-but-None field under pydantic v2).
    new_column_name: Optional[str] = None
|
||||
|
||||
class BinningRequest(BaseModel):
    """Request model for the binning (categorize) operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Numeric column to bin.
    column: str
    # Binning method: 'custom', 'equal_width' or 'equal_freq'.
    method: str
    # Name of the generated category column.
    new_column_name: str
    # Custom cut points (method='custom' only); was `List[Any] = None`,
    # an invalid annotation — now properly Optional.
    bins: Optional[List[Any]] = None
    # Optional labels for the generated intervals.
    labels: Optional[List[Any]] = None
    # Number of bins for equal_width / equal_freq.
    num_bins: int = 3
|
||||
|
||||
class ConditionalRequest(BaseModel):
    """Request model for the conditional-column operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Name of the column to create.
    new_column_name: str
    # IF-THEN rules; each rule holds `conditions`, `logic` ('and'/'or')
    # and `result` (see apply_conditional_column).
    rules: List[Dict[str, Any]]
    # Value assigned to rows that no rule matches.
    else_value: Any = None
|
||||
|
||||
class DropnaRequest(BaseModel):
    """Request model for the drop-missing-values operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Drop mode: 'row', 'column' or 'both'.
    method: str
    # Missing-rate threshold (0-1) used for column dropping.
    threshold: float = 0.5
    # Restrict the missing-value check to these columns; was
    # `List[str] = None`, an invalid annotation — now properly Optional.
    subset: Optional[List[str]] = None
|
||||
|
||||
class ComputeRequest(BaseModel):
    """Request model for the computed-column operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Name of the column to create.
    new_column_name: str
    # Expression referencing column names, arithmetic operators and
    # whitelisted functions (validated server-side by compute_column).
    formula: str
|
||||
|
||||
class PivotRequest(BaseModel):
    """Request model for the long-to-wide pivot operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Column that identifies each output row.
    index_column: str
    # Column whose distinct values become new columns.
    pivot_column: str
    # Value columns to spread across the new columns.
    value_columns: List[str]
    # Aggregation applied to duplicate cells (pandas aggfunc name).
    aggfunc: str = 'first'
|
||||
|
||||
|
||||
# ==================== API路由 ====================
|
||||
|
||||
@@ -592,6 +654,577 @@ async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
|
||||
)
|
||||
|
||||
|
||||
# ==================== ✨ 预写函数API端点 ====================
|
||||
|
||||
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
    """
    Advanced filter operation (pre-written function).

    Args:
        request: FilterRequest
            - data: input rows (list of dicts)
            - conditions: filter conditions
            - logic: 'and' or 'or'

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape ([rows, cols]); HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound before any fallible work so the error branch never hits a
    # NameError on start_time (the old code could).
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # redirect_stdout restores stdout even when an exception escapes,
        # unlike the previous manual sys.stdout assignment.
        # NOTE(review): stdout is still process-global, so concurrent
        # requests may interleave captured output — confirm acceptable.
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_filter(df, request.conditions, request.logic)

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"筛选成功: {len(request.data)} → {len(result_data)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"筛选操作失败: {str(e)}")
        # status_code=400 added for consistency with the other operation
        # endpoints (conditional/dropna/compute/pivot already return 400).
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
    """
    Value recoding operation (pre-written function).

    Args:
        request: RecodeRequest with data, column, mapping,
            create_new_column and optional new_column_name.

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter about
        # stdout being process-global under concurrency).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_recode(
                df,
                request.column,
                request.mapping,
                request.create_new_column,
                request.new_column_name
            )

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"重编码成功: {request.column}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"重编码操作失败: {str(e)}")
        # status_code=400 added for consistency with the other endpoints.
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
    """
    Binning (generate categorical variable) operation (pre-written function).

    Args:
        request: BinningRequest with data, column, method,
            new_column_name and optional bins/labels/num_bins.

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_binning(
                df,
                request.column,
                request.method,
                request.new_column_name,
                request.bins,
                request.labels,
                request.num_bins
            )

            # 1. Categorical columns (produced by pd.cut/qcut) are not JSON
            #    serializable — stringify them. isinstance check replaces the
            #    deprecated pd.api.types.is_categorical_dtype().
            for col in result_df.columns:
                if isinstance(result_df[col].dtype, pd.CategoricalDtype):
                    result_df[col] = result_df[col].astype(str)

            # 2. NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"分箱成功: {request.column} → {request.new_column_name}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"分箱操作失败: {str(e)}")
        # status_code=400 added for consistency with the other endpoints.
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
    """
    Conditional column generation (pre-written function).

    Builds a new column from IF-THEN-ELSE rules.

    Args:
        request: ConditionalRequest
            - data: input rows
            - new_column_name: column to create
            - rules: list of rules, each with conditions/logic/result
            - else_value: default for unmatched rows

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_conditional_column(
                df,
                request.new_column_name,
                request.rules,
                request.else_value
            )

            # NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"条件生成列成功: {request.new_column_name}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"条件生成列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
    """
    Drop-missing-values operation (pre-written function).

    Args:
        request: DropnaRequest
            - data: input rows
            - method: 'row', 'column' or 'both'
            - threshold: missing-rate threshold (0-1)
            - subset: optional list of columns to check

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = drop_missing_values(
                df,
                method=request.method,
                threshold=request.threshold,
                subset=request.subset
            )

            # NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"删除缺失值成功: {request.method}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"删除缺失值操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
    """
    Computed-column operation (pre-written function).

    Computes a new column from a formula.

    Args:
        request: ComputeRequest
            - data: input rows
            - new_column_name: column to create
            - formula: expression over existing columns

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = compute_column(
                df,
                request.new_column_name,
                request.formula
            )

            # Sanitize non-JSON values up front: inf -> None, NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"计算列成功: {request.new_column_name}")

        # BUGFIX: the previous json.dumps(...).replace('NaN', 'null') pass is
        # gone. It corrupted legitimate string values containing "NaN" or
        # "Infinity", and its replace order turned '-Infinity' into the
        # invalid token '-null'. The data is already sanitized above, so a
        # plain JSONResponse is both correct and sufficient.
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"计算列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
    """
    Pivot operation: long-to-wide reshape (pre-written function).

    Args:
        request: PivotRequest
            - data: input rows
            - index_column: column identifying each output row
            - pivot_column: column whose values become new columns
            - value_columns: value columns to spread
            - aggfunc: aggregation for duplicate cells

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = pivot_long_to_wide(
                df,
                request.index_column,
                request.pivot_column,
                request.value_columns,
                request.aggfunc
            )

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"Pivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== 启动配置 ====================
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
16
extraction_service/operations/__init__.py
Normal file
16
extraction_service/operations/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
Data operation function module.

Provides pre-written, tested data-processing functions for the quick
action buttons.

Modules (corrected — the old list named non-existent `missing` and
`duplicate` modules):
- filter: advanced filtering
- recode: value mapping (recoding)
- binning: generate categorical variable (binning)
- conditional: conditional column generation
- dropna: drop missing values
- compute: computed columns
- pivot: long-to-wide reshape
"""

__version__ = '1.0.0'
|
||||
|
||||
152
extraction_service/operations/binning.py
Normal file
152
extraction_service/operations/binning.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
生成分类变量(分箱)操作
|
||||
|
||||
将连续数值变量转换为分类变量。
|
||||
支持三种方法:自定义切点、等宽分箱、等频分箱。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Optional, Literal, Union
|
||||
|
||||
|
||||
def apply_binning(
    df: pd.DataFrame,
    column: str,
    method: Literal['custom', 'equal_width', 'equal_freq'],
    new_column_name: str,
    bins: Optional[List[Union[int, float]]] = None,
    labels: Optional[List[Union[str, int]]] = None,
    num_bins: int = 3
) -> pd.DataFrame:
    """
    Bin a continuous numeric column into a categorical column.

    Args:
        df: input DataFrame
        column: column to bin
        method: binning method
            - 'custom': user-supplied interior cut points
            - 'equal_width': equal-width bins
            - 'equal_freq': equal-frequency bins (quantiles)
        new_column_name: name of the generated category column
        bins: interior cut points (method='custom' only); e.g. [18, 60]
            yields <18, 18-60, >=60
        labels: optional interval labels
        num_bins: number of bins for equal_width / equal_freq

    Returns:
        Copy of `df` with the new category column appended.

    Raises:
        KeyError: if `column` does not exist.
        ValueError: on invalid cut points, label counts, or method.

    Examples:
        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
        >>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'])
        >>> result['年龄分组'].tolist()
        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
    """
    if df.empty:
        return df

    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    result = df.copy()

    # Coerce non-numeric columns; unparseable entries become NaN.
    # (errors='coerce' never raises, so the old dead try/except is gone.)
    if not pd.api.types.is_numeric_dtype(result[column]):
        result[column] = pd.to_numeric(result[column], errors='coerce')
        print(f"警告: 列 '{column}' 已自动转换为数值类型")

    if result[column].isna().all():
        raise ValueError(f"列 '{column}' 中没有有效的数值,无法进行分箱")

    if method == 'custom':
        # User supplies interior cut points; outer boundaries are added
        # automatically so N cut points always yield N+1 intervals.
        if not bins or len(bins) < 1:
            raise ValueError('自定义切点至少需要1个值')

        if bins != sorted(bins):
            raise ValueError('切点必须按升序排列')

        min_val = result[column].min()
        max_val = result[column].max()

        print(f'用户输入切点: {bins}')
        print(f'数据范围: [{min_val:.2f}, {max_val:.2f}]')

        # Outer boundaries: widen by 0.001 so min/max values fall inside
        # the half-open intervals produced by right=False.
        left_bound = min(bins[0], min_val) - 0.001
        right_bound = max(bins[-1], max_val) + 0.001

        full_bins = [left_bound] + bins + [right_bound]

        print(f'完整边界: {[f"{b:.1f}" for b in full_bins]}')
        print(f'将生成 {len(full_bins) - 1} 个区间 = {len(bins) + 1} 个区间')

        # BUGFIX: `is not None` instead of truthiness, so an empty label
        # list is rejected rather than silently ignored.
        expected_label_count = len(full_bins) - 1
        if labels is not None and len(labels) != expected_label_count:
            raise ValueError(f'标签数量({len(labels)})必须等于区间数量({expected_label_count})')

        result[new_column_name] = pd.cut(
            result[column],
            bins=full_bins,
            labels=labels,
            right=False,
            include_lowest=True
        )

    elif method == 'equal_width':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        # Explicit label-count validation (previously pandas raised a less
        # helpful error on mismatch).
        if labels is not None and len(labels) != num_bins:
            raise ValueError(f'标签数量({len(labels)})必须等于区间数量({num_bins})')

        result[new_column_name] = pd.cut(
            result[column],
            bins=num_bins,
            labels=labels,
            include_lowest=True
        )

    elif method == 'equal_freq':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        # No pre-validation of labels here: duplicates='drop' may reduce the
        # number of intervals, so the required label count is data-dependent.
        result[new_column_name] = pd.qcut(
            result[column],
            q=num_bins,
            labels=labels,
            duplicates='drop'  # tolerate repeated quantile boundaries
        )

    else:
        raise ValueError(f"不支持的分箱方法: {method}")

    # Report the category distribution.
    print('分箱结果分布:')
    value_counts = result[new_column_name].value_counts().sort_index()
    for category, count in value_counts.items():
        percentage = count / len(result) * 100
        print(f' {category}: {count} 行 ({percentage:.1f}%)')

    # Values that fell outside all bins (or were NaN) stay NaN.
    missing_count = result[new_column_name].isna().sum()
    if missing_count > 0:
        print(f'警告: {missing_count} 个值无法分箱(可能是缺失值或边界问题)')

    return result
|
||||
|
||||
227
extraction_service/operations/compute.py
Normal file
227
extraction_service/operations/compute.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
计算列 - 预写函数
|
||||
基于公式计算新列,支持数学运算和常用函数
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
# 允许的函数(安全白名单)
|
||||
ALLOWED_FUNCTIONS = {
|
||||
'abs': abs,
|
||||
'round': round,
|
||||
'sqrt': np.sqrt,
|
||||
'log': np.log,
|
||||
'log10': np.log10,
|
||||
'exp': np.exp,
|
||||
'sin': np.sin,
|
||||
'cos': np.cos,
|
||||
'tan': np.tan,
|
||||
'floor': np.floor,
|
||||
'ceil': np.ceil,
|
||||
'min': min,
|
||||
'max': max,
|
||||
}
|
||||
|
||||
|
||||
def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
    """
    Validate a formula for safety and basic correctness.

    Args:
        formula: formula string
        available_columns: available column names (currently unused;
            kept for interface compatibility — NameError at evaluation
            time reports missing columns instead)

    Returns:
        (is_valid, error_message) — error_message is '' when valid.
    """
    # Reject empty / whitespace-only formulas.
    if not formula or not formula.strip():
        return False, '公式不能为空'

    # Dangerous constructs. BUGFIX: the old patterns required trailing
    # whitespace (e.g. r'eval\s'), so `eval(x)` slipped through; they now
    # match the call form directly.
    dangerous_patterns = [
        r'__',               # dunder access (covers __builtins__ too)
        r'\bimport\b',       # import statements / __import__ attempts
        r'\bexec\s*\(',      # exec(...)
        r'\beval\s*\(',      # eval(...)
        r'\bopen\s*\(',      # file access
        r'\bcompile\s*\(',   # compile(...)
        r'\bglobals\s*\(',   # globals(...)
        r'\blocals\s*\(',    # locals(...)
    ]

    for pattern in dangerous_patterns:
        if re.search(pattern, formula, re.IGNORECASE):
            return False, f'公式包含不允许的操作: {pattern}'

    # Character whitelist: identifiers (incl. CJK), digits, whitespace,
    # arithmetic operators, parentheses, dot and comma.
    # BUGFIX: '%' added — compute_column documents modulo support but the
    # old whitelist rejected it. (The redundant `\*\*` entry is gone; `**`
    # is already covered by the single `*`.)
    allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/%\(\)\.,]'
    if not re.match(f'^{allowed_chars}+$', formula):
        return False, '公式包含不允许的字符'

    return True, ''
|
||||
|
||||
|
||||
def compute_column(
    df: pd.DataFrame,
    new_column_name: str,
    formula: str
) -> pd.DataFrame:
    """
    Compute a new column from a formula.

    Args:
        df: input DataFrame
        new_column_name: name of the column to create
        formula: expression referencing column names, arithmetic operators
            (+, -, *, /, **, %) and whitelisted functions (abs, round,
            sqrt, log, exp, ...)

    Returns:
        Copy of `df` with the new column appended.

    Raises:
        ValueError: if the formula fails validation or evaluation.

    Examples:
        # BMI
        compute_column(df, 'BMI', '体重 / (身高/100)**2')

        # square root
        compute_column(df, '年龄_sqrt', 'sqrt(年龄)')

        # weighted score
        compute_column(df, '综合得分', '(FMA*0.6 + ADL*0.4) / 100')
    """
    result = df.copy()

    print(f'计算新列: {new_column_name}')
    print(f'公式: {formula}')
    print('')

    # Reject unsafe or malformed formulas before evaluating anything.
    is_valid, error_msg = validate_formula(formula, list(result.columns))
    if not is_valid:
        raise ValueError(f'公式验证失败: {error_msg}')

    # Build the evaluation environment:
    # 1. each column as a variable, auto-coerced to numeric when possible.
    env = {}
    for col in result.columns:
        try:
            numeric_col = pd.to_numeric(result[col], errors='coerce')
            # Treat as numeric only if at least one value converted.
            if not numeric_col.isna().all():
                env[col] = numeric_col
                print(f' 列 "{col}" 自动转换为数值类型')
            else:
                env[col] = result[col]
        except Exception:
            env[col] = result[col]

    # 2. whitelisted functions, 3. numpy for math.
    env.update(ALLOWED_FUNCTIONS)
    env['np'] = np

    try:
        # SECURITY NOTE: eval() on user input — guarded by validate_formula's
        # blacklist/whitelist plus an empty __builtins__ and a controlled env.
        result[new_column_name] = eval(formula, {"__builtins__": {}}, env)

        print(f'计算成功!')
        print(f'新列类型: {result[new_column_name].dtype}')
        print(f'新列前5个值:')
        for idx, val in result[new_column_name].head().items():
            if pd.isna(val):
                print(f' [{idx}] None (NaN)')
            # BUGFIX: np.isinf raises TypeError on non-numeric values, so it
            # is only called once val is known to be a number.
            elif isinstance(val, (int, float, np.number)) and np.isinf(val):
                print(f' [{idx}] None (inf)')
            else:
                print(f' [{idx}] {val}')
        print('')

        # Summary statistics over valid (finite, non-NaN) values only.
        if pd.api.types.is_numeric_dtype(result[new_column_name]):
            col_data = result[new_column_name]

            nan_count = col_data.isna().sum()
            inf_count = np.isinf(col_data.replace([np.nan], 0)).sum()

            print(f'统计信息:')

            valid_data = col_data.dropna().replace([np.inf, -np.inf], np.nan).dropna()

            if len(valid_data) > 0:
                print(f' 最小值: {valid_data.min():.2f}')
                print(f' 最大值: {valid_data.max():.2f}')
                print(f' 平均值: {valid_data.mean():.2f}')
            else:
                print(f' 没有有效的数值')

            if nan_count > 0:
                print(f' 缺失值(NaN): {nan_count} 个')
            if inf_count > 0:
                print(f' 无穷大值(inf): {inf_count} 个')
        else:
            print(f'非数值类型,跳过统计')

        return result

    except NameError as e:
        # A column referenced in the formula does not exist. BUGFIX: guard
        # the quote-splitting — some NameError messages carry no quotes,
        # which used to raise an IndexError here.
        parts = str(e).split("'")
        missing_col = parts[1] if len(parts) > 1 else str(e)
        raise ValueError(f'列 "{missing_col}" 不存在,请检查公式中的列名') from e

    except ZeroDivisionError as e:
        raise ValueError('除零错误:公式中存在除以0的情况') from e

    except Exception as e:
        raise ValueError(f'计算失败: {str(e)}') from e
|
||||
|
||||
|
||||
def get_formula_examples() -> list[Dict[str, str]]:
    """
    Return ready-made formula examples for the UI.

    Returns:
        List of dicts, each with 'name', 'formula' and 'description'.
    """
    examples = (
        ('BMI计算', '体重 / (身高/100)**2',
         '体重指数(需要身高(cm)和体重(kg)列)'),
        ('年龄分组', 'round(年龄 / 10) * 10',
         '按10岁为一组(20, 30, 40...)'),
        ('综合得分', '(FMA得分 * 0.6 + ADL得分 * 0.4)',
         '加权平均分'),
        ('变化率', '(随访值 - 基线值) / 基线值 * 100',
         '计算变化百分比'),
        ('对数转换', 'log(值 + 1)',
         '对数变换(处理偏态分布)'),
    )
    return [
        {'name': name, 'formula': formula, 'description': description}
        for name, formula, description in examples
    ]
|
||||
|
||||
188
extraction_service/operations/conditional.py
Normal file
188
extraction_service/operations/conditional.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
条件生成列 - 预写函数
|
||||
支持复杂的IF-THEN-ELSE多条件逻辑
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Union
|
||||
|
||||
|
||||
def apply_conditional_column(
    df: pd.DataFrame,
    new_column_name: str,
    rules: List[Dict[str, Any]],
    else_value: Any = None
) -> pd.DataFrame:
    """
    Generate a new column from multi-condition rules.

    Args:
        df: input DataFrame
        new_column_name: name of the column to create
        rules: list of rules; each rule contains:
            - conditions: list of condition dicts (column/operator/value)
            - logic: 'and' or 'or'
            - result: value assigned to rows matching the rule
        else_value: default for rows no rule matches

    Returns:
        Copy of `df` with the new column appended.

    NOTE(review): rules are applied in order with .loc assignment, so a
    LATER rule overwrites an earlier one on overlapping rows (last match
    wins) — confirm this is the intended precedence.

    Example:
        rules = [
            {
                "conditions": [
                    {"column": "年龄", "operator": ">=", "value": 60}
                ],
                "logic": "and",
                "result": "老年"
            },
            {
                "conditions": [
                    {"column": "年龄", "operator": ">=", "value": 18},
                    {"column": "年龄", "operator": "<", "value": 60}
                ],
                "logic": "and",
                "result": "成年"
            }
        ]
    """
    result = df.copy()

    # Validate rules: must be non-empty.
    if not rules or len(rules) == 0:
        raise ValueError('至少需要1条规则')

    # Validate every referenced column before touching the data.
    for rule in rules:
        for condition in rule.get('conditions', []):
            column = condition.get('column')
            if column not in result.columns:
                raise ValueError(f'列 "{column}" 不存在')

    # Initialize the whole column to the default value.
    result[new_column_name] = else_value

    print(f'开始应用条件规则,共 {len(rules)} 条规则')

    # Apply each rule in order (1-based index for log messages).
    for rule_idx, rule in enumerate(rules, 1):
        conditions = rule.get('conditions', [])
        logic = rule.get('logic', 'and')
        result_value = rule.get('result')

        # Rules without conditions are skipped silently.
        if not conditions:
            continue

        # Build one boolean mask per condition.
        masks = []
        for condition in conditions:
            column = condition['column']
            operator = condition['operator']
            value = condition['value']

            # Smart type coercion: for ordering comparisons, try to compare
            # numerically (column coerced with to_numeric, value to float).
            if operator in ('>', '<', '>=', '<='):
                try:
                    col_data = pd.to_numeric(result[column], errors='coerce')
                    # Ensure the comparison value is numeric as well.
                    if not isinstance(value, (int, float)):
                        value = float(value)
                except Exception:
                    # Fall back to raw comparison if coercion fails.
                    col_data = result[column]
            else:
                # Equality comparisons use the raw data.
                col_data = result[column]

            # Translate operator into a boolean mask.
            if operator == '=':
                mask = col_data == value
            elif operator == '!=':
                mask = col_data != value
            elif operator == '>':
                mask = col_data > value
            elif operator == '<':
                mask = col_data < value
            elif operator == '>=':
                mask = col_data >= value
            elif operator == '<=':
                mask = col_data <= value
            else:
                raise ValueError(f'不支持的运算符: {operator}')

            masks.append(mask)

        # Combine the per-condition masks ('and' = all, 'or' = any).
        if logic == 'and':
            final_mask = pd.concat(masks, axis=1).all(axis=1)
        elif logic == 'or':
            final_mask = pd.concat(masks, axis=1).any(axis=1)
        else:
            raise ValueError(f'不支持的逻辑运算符: {logic}')

        # Assign the rule's result to the matching rows.
        matched_count = final_mask.sum()
        result.loc[final_mask, new_column_name] = result_value

        print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')

    # Report the final value distribution (including unmatched/NaN rows).
    print(f'\n结果分布:')
    value_counts = result[new_column_name].value_counts(dropna=False)
    for value, count in value_counts.items():
        percentage = count / len(result) * 100
        if pd.isna(value):
            print(f' (空值): {count} 行 ({percentage:.1f}%)')
        else:
            print(f' {value}: {count} 行 ({percentage:.1f}%)')

    return result
|
||||
|
||||
|
||||
def apply_simple_binning(
    df: pd.DataFrame,
    column: str,
    new_column_name: str,
    threshold: float,
    value_if_true: Any = 1,
    value_if_false: Any = 0
) -> pd.DataFrame:
    """Simple binary classification by a single threshold.

    Simplified version of the conditional-column feature: rows where
    ``column >= threshold`` receive ``value_if_true``, rows below the
    threshold receive ``value_if_false``, and missing values stay missing.

    Args:
        df: Input data frame.
        column: Column used for the comparison.
        new_column_name: Name of the column to create.
        threshold: Threshold value.
        value_if_true: Value assigned when column >= threshold.
        value_if_false: Value assigned when column < threshold.

    Returns:
        A copy of ``df`` with the new column added.

    Example:
        Inpatient exposure grouping:
        needling duration >= 10 -> 1 (exposed)
        needling duration < 10  -> 0 (not exposed)
    """
    result = df.copy()

    if column not in result.columns:
        raise ValueError(f'列 "{column}" 不存在')

    # Build both masks once.  Note: comparisons against NaN are False for
    # both ">=" and "<", so NaN must be handled explicitly below instead of
    # silently falling into either branch.
    above = result[column] >= threshold
    below = result[column] < threshold

    # map() assigns the branch values directly; the previous arithmetic
    # formula (mask.astype(int) * value) broke for non-numeric values and
    # produced 0/'' for NaN rows (neither branch value).
    result[new_column_name] = above.map({True: value_if_true, False: value_if_false})
    # Preserve missingness: NaN input stays NaN in the output column.
    result.loc[result[column].isna(), new_column_name] = float('nan')

    # Distribution summary (console output kept from the original).
    print(f'简单二分类结果:')
    print(f' {column} >= {threshold}: {above.sum()} 行 → {value_if_true}')
    print(f' {column} < {threshold}: {below.sum()} 行 → {value_if_false}')

    return result
|
||||
|
||||
149
extraction_service/operations/dropna.py
Normal file
149
extraction_service/operations/dropna.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
删除缺失值 - 预写函数
|
||||
支持按行删除、按列删除、阈值控制
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Literal, Optional, List
|
||||
|
||||
|
||||
def drop_missing_values(
    df: pd.DataFrame,
    method: Literal['row', 'column', 'both'] = 'row',
    threshold: Optional[float] = None,
    subset: Optional[List[str]] = None
) -> pd.DataFrame:
    """Drop missing values by row, by column, or both.

    Args:
        df: Input data frame.
        method: Drop strategy:
            - 'row': drop rows that contain missing values.
            - 'column': drop columns whose missing rate exceeds the threshold.
            - 'both': drop columns first, then rows.
        threshold: Missing-rate threshold (between 0 and 1); only used for
            'column' and 'both'. A column is dropped when its missing rate
            exceeds this value. Defaults to 0.5 (50%).
        subset: Only check the listed columns when dropping rows.

    Returns:
        The data frame with missing values removed.

    Examples:
        # Drop every row containing a missing value
        drop_missing_values(df, method='row')

        # Drop columns with >30% missing
        drop_missing_values(df, method='column', threshold=0.3)

        # Drop sparse columns first, then incomplete rows
        drop_missing_values(df, method='both', threshold=0.5)

        # Only check the given columns
        drop_missing_values(df, method='row', subset=['年龄', 'BMI'])
    """
    result = df.copy()
    original_shape = result.shape

    print(f'原始数据: {original_shape[0]} 行 × {original_shape[1]} 列')
    print(f'缺失值总数: {result.isna().sum().sum()}')
    print('')

    # Default threshold: 50% missing.
    if threshold is None:
        threshold = 0.5

    # --- Column phase ---
    if method in ('column', 'both'):
        # Per-column missing rate; guard the division when the frame has
        # no rows (there is nothing to measure, so drop nothing).
        if len(result) > 0:
            missing_rate = result.isna().sum() / len(result)
            cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()
        else:
            cols_to_drop = []

        if cols_to_drop:
            print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}个')
            for col in cols_to_drop:
                rate = missing_rate[col]
                count = result[col].isna().sum()
                print(f' - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')

            result = result.drop(columns=cols_to_drop)
            print(f'删除后: {result.shape[0]} 行 × {result.shape[1]} 列')
            print('')
        else:
            print(f'没有找到缺失率>{threshold*100:.0f}%的列')
            print('')

    # --- Row phase ---
    if method in ('row', 'both'):
        before_rows = len(result)

        if subset:
            # Keep only subset columns that still exist: with method='both'
            # the column phase may already have dropped some of them, and
            # dropna(subset=...) would then raise KeyError.
            existing = [col for col in subset if col in result.columns]
            print(f'仅检查指定列的缺失值: {existing}')
            if existing:
                result = result.dropna(subset=existing)
        else:
            # Check every column.
            result = result.dropna()

        dropped_rows = before_rows - len(result)
        if dropped_rows > 0:
            print(f'删除了 {dropped_rows} 行(包含缺失值的行)')
            print(f'保留了 {len(result)} 行({len(result)/before_rows*100:.1f}%)')
        else:
            print('没有找到包含缺失值的行')
        print('')

    # --- Final summary ---
    final_shape = result.shape
    print(f'最终结果: {final_shape[0]} 行 × {final_shape[1]} 列')
    print(f'删除了 {original_shape[0] - final_shape[0]} 行')
    print(f'删除了 {original_shape[1] - final_shape[1]} 列')
    print(f'剩余缺失值: {result.isna().sum().sum()}')

    # Warn when everything was removed.
    if len(result) == 0:
        print('\n⚠️ 警告: 删除后数据为空!')

    return result
|
||||
|
||||
|
||||
def get_missing_summary(df: pd.DataFrame) -> dict:
    """Summarize missing values in a data frame.

    Args:
        df: Input data frame.

    Returns:
        A dict with overall counts, the number of affected rows, and a
        per-column breakdown (count and rate) for columns that have
        at least one missing value.
    """
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    na_mask = df.isna()
    per_column = na_mask.sum()
    total_missing = per_column.sum()

    # Rows that contain at least one missing value.
    affected_rows = (na_mask.sum(axis=1) > 0).sum()

    # Per-column detail, restricted to columns with missing values.
    detail = {
        col: {
            'count': int(cnt),
            'rate': float(cnt / n_rows),
        }
        for col, cnt in per_column.items()
        if cnt > 0
    }

    return {
        'total_cells': total_cells,
        'total_missing': int(total_missing),
        'missing_rate': total_missing / total_cells if total_cells > 0 else 0,
        'rows_with_missing': int(affected_rows),
        'cols_with_missing': len(detail),
        'col_missing_detail': detail,
    }
|
||||
|
||||
109
extraction_service/operations/filter.py
Normal file
109
extraction_service/operations/filter.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
高级筛选操作
|
||||
|
||||
提供多条件筛选功能,支持AND/OR逻辑组合。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Literal
|
||||
|
||||
|
||||
def apply_filter(
    df: pd.DataFrame,
    conditions: List[Dict[str, Any]],
    logic: Literal['and', 'or'] = 'and'
) -> pd.DataFrame:
    """Filter a data frame by a list of conditions combined with AND/OR.

    Args:
        df: Input data frame.
        conditions: List of filter conditions; each condition contains:
            - column: column name
            - operator: one of =, !=, >, <, >=, <=, contains, not_contains,
              starts_with, ends_with, is_null, not_null
            - value: comparison value (unused for is_null / not_null)
        logic: How to combine the conditions ('and' or 'or').

    Returns:
        The filtered data frame (a copy).

    Examples:
        >>> df = pd.DataFrame({'年龄': [25, 35, 45], '性别': ['男', '女', '男']})
        >>> conditions = [
        ...     {'column': '年龄', 'operator': '>', 'value': 30},
        ...     {'column': '性别', 'operator': '=', 'value': '男'}
        ... ]
        >>> result = apply_filter(df, conditions, logic='and')
        >>> len(result)
        1
    """
    if not conditions:
        raise ValueError('筛选条件不能为空')

    if df.empty:
        return df

    # Operator dispatch table: each entry maps an operator name to a
    # callable (series, value) -> boolean mask.  String operators coerce
    # the column to str and treat NaN as non-matching (na=False).
    comparators = {
        '=': lambda s, v: s == v,
        '!=': lambda s, v: s != v,
        '>': lambda s, v: s > v,
        '<': lambda s, v: s < v,
        '>=': lambda s, v: s >= v,
        '<=': lambda s, v: s <= v,
        'contains': lambda s, v: s.astype(str).str.contains(str(v), na=False),
        'not_contains': lambda s, v: ~s.astype(str).str.contains(str(v), na=False),
        'starts_with': lambda s, v: s.astype(str).str.startswith(str(v), na=False),
        'ends_with': lambda s, v: s.astype(str).str.endswith(str(v), na=False),
        'is_null': lambda s, v: s.isna(),
        'not_null': lambda s, v: s.notna(),
    }

    masks = []
    for cond in conditions:
        column = cond['column']
        operator = cond['operator']
        value = cond.get('value')

        # The column must exist before any comparison.
        if column not in df.columns:
            raise KeyError(f"列 '{column}' 不存在")

        if operator not in comparators:
            raise ValueError(f"不支持的运算符: {operator}")

        masks.append(comparators[operator](df[column], value))

    # Combine the per-condition masks.
    combined = pd.concat(masks, axis=1)
    if logic == 'and':
        final_mask = combined.all(axis=1)
    elif logic == 'or':
        final_mask = combined.any(axis=1)
    else:
        raise ValueError(f"不支持的逻辑运算: {logic}")

    result = df[final_mask].copy()

    # Console summary of the filtering outcome.
    original_rows = len(df)
    filtered_rows = len(result)
    removed_rows = original_rows - filtered_rows

    print(f'原始数据: {original_rows} 行')
    print(f'筛选后: {filtered_rows} 行')
    print(f'删除: {removed_rows} 行 ({removed_rows/original_rows*100:.1f}%)')

    return result
|
||||
|
||||
161
extraction_service/operations/pivot.py
Normal file
161
extraction_service/operations/pivot.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Pivot操作 - 预写函数
|
||||
长表转宽表(一人多行 → 一人一行)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
|
||||
def pivot_long_to_wide(
    df: pd.DataFrame,
    index_column: str,
    pivot_column: str,
    value_columns: List[str],
    aggfunc: Literal['first', 'last', 'mean', 'sum', 'min', 'max'] = 'first'
) -> pd.DataFrame:
    """Pivot a long table into a wide table (many rows per subject -> one row).

    Args:
        df: Input data frame.
        index_column: Unique identifier column (e.g. Record ID).
        pivot_column: Column whose values become the new column names
            (e.g. Event Name).
        value_columns: Data columns to spread out (e.g. FMA score, ADL score).
        aggfunc: Aggregation applied when an (index, pivot) pair repeats:
            'first' (recommended), 'last', 'mean', 'sum', 'min' or 'max'.

    Returns:
        The widened data frame; new columns are named '<value>_<pivot value>'.

    Example:
        pivot_long_to_wide(
            df,
            index_column='Record ID',
            pivot_column='Event Name',
            value_columns=['FMA得分', 'ADL得分'],
            aggfunc='first'
        )
    """
    result = df.copy()

    print(f'原始数据: {len(result)} 行 × {len(result.columns)} 列')
    print(f'索引列: {index_column}')
    print(f'透视列: {pivot_column}')
    print(f'值列: {", ".join(value_columns)}')
    print(f'聚合方式: {aggfunc}')
    print('')

    # Validate that every referenced column exists.
    required_cols = [index_column, pivot_column] + value_columns
    missing_cols = [col for col in required_cols if col not in result.columns]
    if missing_cols:
        raise ValueError(f'以下列不存在: {", ".join(missing_cols)}')

    # Cardinality of the future row index.
    unique_index = result[index_column].nunique()
    print(f'唯一{index_column}数量: {unique_index}')

    # Distinct pivot values (the future column suffixes).
    unique_pivot = result[pivot_column].unique()
    print(f'透视列"{pivot_column}"的唯一值: {list(unique_pivot)}')
    print('')

    try:
        # Perform the pivot; aggfunc resolves duplicate (index, pivot) pairs.
        df_pivot = result.pivot_table(
            index=index_column,
            columns=pivot_column,
            values=value_columns,
            aggfunc=aggfunc
        )

        # pivot_table returns a (value, pivot) MultiIndex whenever ``values``
        # is passed as a list -- even a single-element list -- so flatten
        # whenever a MultiIndex is present.  The previous single-column
        # branch formatted the tuple itself into the name (producing e.g.
        # "v_('v', 'a')" instead of "v_a").
        if isinstance(df_pivot.columns, pd.MultiIndex):
            df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
        else:
            df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]

        # Turn the index back into a regular column.
        df_pivot = df_pivot.reset_index()

        print(f'转换成功!')
        print(f'结果: {len(df_pivot)} 行 × {len(df_pivot.columns)} 列')
        print(f'新增列: {len(df_pivot.columns) - 1} 列')
        print('')

        # Show the generated column names (first 10 at most).
        print(f'生成的列名:')
        new_cols = [col for col in df_pivot.columns if col != index_column]
        for i, col in enumerate(new_cols[:10], 1):
            print(f' {i}. {col}')
        if len(new_cols) > 10:
            print(f' ... 还有 {len(new_cols) - 10} 列')

        return df_pivot

    except ValueError as e:
        # NOTE(review): pivot_table aggregates duplicates itself, so this
        # branch is mainly defensive (DataFrame.pivot raises this message).
        # Kept to surface duplicate diagnostics if it ever fires.
        if 'Index contains duplicate entries' in str(e):
            # Report the duplicated (index, pivot) combinations.
            duplicates = result.groupby([index_column, pivot_column]).size()
            duplicates = duplicates[duplicates > 1]

            print('⚠️ 警告: 发现重复的索引+透视组合:')
            for (idx, piv), count in duplicates.head(5).items():
                print(f' {index_column}={idx}, {pivot_column}={piv}: {count}次')

            if len(duplicates) > 5:
                print(f' ... 还有 {len(duplicates) - 5} 个重复组合')

            print(f'\n建议: 使用聚合函数(如mean、sum)处理重复值')
            print(f'当前聚合方式: {aggfunc}')

            raise ValueError(f'存在重复的{index_column}+{pivot_column}组合,需要选择合适的聚合方式')
        else:
            raise e
|
||||
|
||||
|
||||
def get_pivot_preview(
    df: pd.DataFrame,
    index_column: str,
    pivot_column: str
) -> dict:
    """Collect preview statistics for a prospective pivot operation.

    Args:
        df: Input data frame.
        index_column: Column that will become the row index.
        pivot_column: Column whose values will become new column names.

    Returns:
        A dict describing index cardinality, distinct pivot values, and
        whether duplicate (index, pivot) combinations exist.
    """
    index_cardinality = df[index_column].nunique()
    pivot_values = df[pivot_column].unique()

    # Count rows per (index, pivot) pair; any count > 1 means the pivot
    # will need an aggregation function.
    pair_sizes = df.groupby([index_column, pivot_column]).size()
    dup_pairs = pair_sizes[pair_sizes > 1]
    has_duplicates = not dup_pairs.empty

    return {
        'unique_index_count': int(index_cardinality),
        'unique_pivot_values': [str(v) for v in pivot_values],
        'has_duplicates': bool(has_duplicates),
        'duplicate_count': len(dup_pairs) if has_duplicates else 0,
        'estimated_rows': int(index_cardinality),
        'estimated_columns': len(pivot_values),
    }
|
||||
|
||||
79
extraction_service/operations/recode.py
Normal file
79
extraction_service/operations/recode.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
数值映射(重编码)操作
|
||||
|
||||
将分类变量的原始值映射为新值(如:男→1,女→2)。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
def apply_recode(
    df: pd.DataFrame,
    column: str,
    mapping: Dict[Any, Any],
    create_new_column: bool = True,
    new_column_name: Optional[str] = None
) -> pd.DataFrame:
    """Recode (re-map) the values of a categorical column.

    Args:
        df: Input data frame.
        column: Column to recode.
        mapping: Value mapping, e.g. {'男': 1, '女': 2}.
        create_new_column: If True, write the result to a new column;
            otherwise overwrite ``column`` in place.
        new_column_name: Name of the new column (used when
            ``create_new_column`` is True; defaults to '<column>_编码').

    Returns:
        The recoded data frame (a copy of ``df``); values with no entry
        in ``mapping`` become NaN.

    Examples:
        >>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
        >>> mapping = {'男': 1, '女': 2}
        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
        >>> result['性别_编码'].tolist()
        [1, 2, 1, 2]
    """
    if df.empty:
        return df

    # Validate inputs before doing any work.
    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    if not mapping:
        raise ValueError('映射字典不能为空')

    # Resolve the target column name.
    if create_new_column:
        target_column = new_column_name or f'{column}_编码'
    else:
        target_column = column

    # Work on a copy so the caller's frame is untouched.
    result = df.copy()

    # Apply the mapping; unmapped values become NaN.
    result[target_column] = result[column].map(mapping)

    # Summary statistics.
    mapped_count = result[target_column].notna().sum()
    unmapped_count = result[target_column].isna().sum()
    total_count = len(result)

    print(f'映射完成: {mapped_count} 个值成功映射')

    if unmapped_count > 0:
        print(f'警告: {unmapped_count} 个值未找到对应映射')
        unmapped_mask = result[target_column].isna()
        # Read the original values from ``df``, not ``result``: when
        # create_new_column=False the column has already been overwritten,
        # so ``result`` would only show NaN here.
        unmapped_values = df.loc[unmapped_mask, column].unique()
        print(f'未映射的值: {list(unmapped_values)[:10]}')  # show at most 10

    # Mapping success rate (guard against an all-filtered frame).
    success_rate = (mapped_count / total_count * 100) if total_count > 0 else 0
    print(f'映射成功率: {success_rate:.1f}%')

    return result
|
||||
|
||||
@@ -279,3 +279,5 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -45,3 +45,5 @@ except Exception as e:
|
||||
print(f"\n❌ 测试异常: {str(e)}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -25,3 +25,5 @@ except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user