feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions

Summary:
- Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot)
- Refactor to pre-written Python functions architecture (stable and secure)
- Add 7 Python operations modules with full type hints
- Add 7 frontend Dialog components with user-friendly UI
- Fix NaN serialization issues and auto type conversion
- Update all related documentation

Technical Details:
- Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py)
- Backend: QuickActionService.ts with 7 execute methods
- Frontend: 7 Dialog components with complete validation
- Toolbar: Enable 7 quick action buttons

Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
2025-12-08 17:38:08 +08:00
parent af325348b8
commit f729699510
158 changed files with 13814 additions and 273 deletions

View File

@@ -62,6 +62,15 @@ from services.docx_extractor import extract_docx_mammoth, validate_docx_file
from typing import Optional

from services.txt_extractor import extract_txt, validate_txt_file
from services.dc_executor import validate_code, execute_pandas_code
# ✨ 导入预写的数据操作函数
from operations.filter import apply_filter
from operations.recode import apply_recode
from operations.binning import apply_binning
from operations.conditional import apply_conditional_column, apply_simple_binning
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
# ==================== Pydantic Models ====================
@@ -74,6 +83,59 @@ class ExecuteCodeRequest(BaseModel):
data: List[Dict[str, Any]]
code: str
# ✨ 预写函数请求模型
class FilterRequest(BaseModel):
    """Request body for the advanced row-filter operation."""
    data: List[Dict[str, Any]]        # input rows, one dict per record (column -> value)
    conditions: List[Dict[str, Any]]  # condition dicts, interpreted by apply_filter
    logic: str = 'and'                # 'and' or 'or' — how multiple conditions combine
class RecodeRequest(BaseModel):
    """Request body for the value-recoding (mapping) operation."""
    data: List[Dict[str, Any]]      # input rows, one dict per record
    column: str                     # column whose values are remapped
    mapping: Dict[Any, Any]         # old value -> new value
    create_new_column: bool = True  # write result to a new column instead of in place
    # Fixed: was `str = None`, which is type-incorrect (pydantic v2 rejects
    # None for a plain `str` field). Presumably apply_recode derives a name
    # when this is None — TODO confirm against operations/recode.py.
    new_column_name: Optional[str] = None
class BinningRequest(BaseModel):
    """Request body for the binning (categorical bucketing) operation."""
    data: List[Dict[str, Any]]  # input rows, one dict per record
    column: str                 # column to bin
    method: str                 # binning method name, interpreted by apply_binning
    new_column_name: str        # column that receives the bin labels
    # Fixed: both were `List[Any] = None`, which is type-incorrect — a plain
    # List field cannot default to None (pydantic v2 rejects it).
    bins: Optional[List[Any]] = None    # explicit bin edges (optional)
    labels: Optional[List[Any]] = None  # labels for the bins (optional)
    num_bins: int = 3                   # bin count when explicit edges are not given
class ConditionalRequest(BaseModel):
    """Request body for the conditional derived-column operation."""
    data: List[Dict[str, Any]]   # input rows, one dict per record
    new_column_name: str         # name of the column to create
    rules: List[Dict[str, Any]]  # IF-THEN rules; each has conditions, logic, result
    else_value: Any = None       # value assigned when no rule matches
class DropnaRequest(BaseModel):
    """Request body for the drop-missing-values operation."""
    data: List[Dict[str, Any]]  # input rows, one dict per record
    method: str                 # 'row', 'column', or 'both'
    threshold: float = 0.5      # missing-rate threshold (0-1)
    # Fixed: was `List[str] = None`, which is type-incorrect — a plain List
    # field cannot default to None (pydantic v2 rejects it).
    subset: Optional[List[str]] = None  # restrict the missing-value check to these columns
class ComputeRequest(BaseModel):
    """Request body for the computed-column operation."""
    data: List[Dict[str, Any]]  # input rows, one dict per record
    new_column_name: str        # name of the column to create
    formula: str                # formula evaluated by compute_column
class PivotRequest(BaseModel):
    """Request body for the long-to-wide pivot operation."""
    data: List[Dict[str, Any]]  # input rows in long format
    index_column: str           # column identifying each output row
    pivot_column: str           # column whose values become new column names
    value_columns: List[str]    # value columns spread across the pivot
    aggfunc: str = 'first'      # aggregation for duplicate index/pivot pairs
# ==================== API路由 ====================
@@ -592,6 +654,577 @@ async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
)
# ==================== ✨ 预写函数API端点 ====================
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
    """Advanced row filtering (pre-written function).

    Args:
        request: FilterRequest
            - data: input rows as a list of records
            - conditions: condition dicts interpreted by apply_filter
            - logic: 'and' or 'or' — how multiple conditions combine

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape": [rows, cols] on success. NaN and +/-inf values
        are converted to null. Errors return HTTP 200 with success=False
        (NOTE(review): conditional/dropna/compute/pivot use 400 — confirm
        which convention the frontend expects).
    """
    import io
    import sys
    import time

    import numpy as np
    import pandas as pd

    # Taken before any fallible work: the original read start_time in the
    # outer except, which raised NameError (masking the real error) if the
    # failure happened before start_time was assigned.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): redirecting the process-wide sys.stdout is not safe under
    # concurrent requests — output from parallel handlers can interleave.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = apply_filter(df, request.conditions, request.logic)

        # JSON-safety: +/-inf and NaN all become null.
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"筛选成功: {len(request.data)}{len(result_data)}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"筛选操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
    finally:
        # Always restore stdout. The original restored it only inside
        # `except Exception`, so a BaseException (e.g. asyncio.CancelledError
        # on client disconnect) left sys.stdout hijacked process-wide.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
    """Value recoding (mapping) operation (pre-written function).

    Args:
        request: RecodeRequest — data, column, mapping,
            create_new_column, new_column_name.

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success. Errors return HTTP 200 with
        success=False.
    """
    import io
    import sys
    import time

    import numpy as np
    import pandas as pd

    # Taken before any fallible work, so the error branch no longer needs
    # the original's `if 'start_time' in locals()` guard.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = apply_recode(
            df,
            request.column,
            request.mapping,
            request.create_new_column,
            request.new_column_name
        )
        # JSON-safety: +/-inf and NaN all become null.
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"重编码成功: {request.column}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"重编码操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
    """Binning (categorical bucketing) operation (pre-written function).

    Args:
        request: BinningRequest — data, column, method, new_column_name,
            bins, labels, num_bins.

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success. Errors return HTTP 200 with
        success=False.
    """
    import io
    import sys
    import time

    import pandas as pd

    # Taken before any fallible work so the error branch can always use it.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = apply_binning(
            df,
            request.column,
            request.method,
            request.new_column_name,
            request.bins,
            request.labels,
            request.num_bins
        )
        # Make categorical bin columns JSON-safe. Stringify only the non-null
        # entries: the original's whole-column astype(str) turned NaN into the
        # literal string "nan", which the later null conversion cannot catch.
        # (isinstance against pd.CategoricalDtype replaces the deprecated
        # pd.api.types.is_categorical_dtype.)
        for col in result_df.columns:
            if isinstance(result_df[col].dtype, pd.CategoricalDtype):
                notna = result_df[col].notna()
                result_df[col] = result_df[col].astype(object)
                result_df.loc[notna, col] = result_df.loc[notna, col].astype(str)
        # Remaining NaN become null to avoid JSON serialization errors.
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"分箱成功: {request.column}{request.new_column_name}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"分箱操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        })
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
    """Conditional derived-column operation (pre-written function).

    Builds a new column from ordered IF-THEN-ELSE rules.

    Args:
        request: ConditionalRequest
            - data: input rows
            - new_column_name: name of the column to create
            - rules: rule dicts, each with conditions, logic, result
            - else_value: default when no rule matches

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success; HTTP 400 with success=False on error.
    """
    import io
    import sys
    import time

    import pandas as pd

    # Taken before any fallible work so the error branch can always use it.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = apply_conditional_column(
            df,
            request.new_column_name,
            request.rules,
            request.else_value
        )
        # JSON-safety: NaN becomes null. NOTE(review): unlike filter/compute,
        # +/-inf is not handled here — confirm whether rules can produce it.
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"条件生成列成功: {request.new_column_name}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"条件生成列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
    """Drop-missing-values operation (pre-written function).

    Args:
        request: DropnaRequest
            - data: input rows
            - method: 'row', 'column', or 'both'
            - threshold: missing-rate threshold (0-1)
            - subset: restrict the check to these columns (optional)

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success; HTTP 400 with success=False on error.
    """
    import io
    import sys
    import time

    import pandas as pd

    # Taken before any fallible work so the error branch can always use it.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = drop_missing_values(
            df,
            method=request.method,
            threshold=request.threshold,
            subset=request.subset
        )
        # JSON-safety: NaN becomes null.
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"删除缺失值成功: {request.method}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"删除缺失值操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
    """Computed-column operation (pre-written function).

    Creates a new column from a formula via compute_column.

    Args:
        request: ComputeRequest — data, new_column_name, formula.

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success; HTTP 400 with success=False on error.
        NaN and +/-inf values are converted to null.
    """
    import io
    import sys
    import time

    import numpy as np
    import pandas as pd

    # Taken before any fallible work so the error branch can always use it.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = compute_column(
            df,
            request.new_column_name,
            request.formula
        )
        # JSON-safety: +/-inf and NaN all become null before serialization.
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"计算列成功: {request.new_column_name}")
        # The frame is already JSON-safe, so serialize normally. The original
        # round-tripped through json.dumps and then did textual
        # .replace('NaN', 'null') / .replace('Infinity', 'null') on the JSON
        # string — that corrupts legitimate string values containing those
        # words, and because 'Infinity' was replaced before '-Infinity' it
        # emitted the invalid JSON token `-null`.
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"计算列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
    """Pivot (long-to-wide) operation (pre-written function).

    Spreads vertically repeated data into horizontal columns.

    Args:
        request: PivotRequest
            - data: input rows in long format
            - index_column: column identifying each output row
            - pivot_column: column whose values become new column names
            - value_columns: value columns to spread
            - aggfunc: aggregation for duplicate index/pivot pairs

    Returns:
        JSONResponse: {"success", "result_data", "output", "execution_time"}
        plus "result_shape" on success; HTTP 400 with success=False on error.
        NaN and +/-inf values are converted to null.
    """
    import io
    import sys
    import time

    import numpy as np
    import pandas as pd

    # Taken before any fallible work so the error branch can always use it.
    start_time = time.time()
    captured_output = io.StringIO()
    # NOTE(review): process-wide stdout redirection is not concurrency-safe.
    sys.stdout = captured_output
    try:
        df = pd.DataFrame(request.data)
        result_df = pivot_long_to_wide(
            df,
            request.index_column,
            request.pivot_column,
            request.value_columns,
            request.aggfunc
        )
        # JSON-safety: +/-inf and NaN all become null.
        result_df = result_df.replace([np.inf, -np.inf], None)
        result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
        result_data = result_df_clean.to_dict('records')

        output = captured_output.getvalue()
        execution_time = time.time() - start_time
        logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })
    except Exception as e:
        logger.error(f"Pivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
    finally:
        # Always restore stdout — the original restored it only inside
        # `except Exception`, leaking the redirect on any BaseException.
        sys.stdout = sys.__stdout__
# ==================== 启动配置 ====================
if __name__ == "__main__":