feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions

Summary:
- Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot)
- Refactor to pre-written Python functions architecture (stable and secure)
- Add 7 Python operations modules with full type hints
- Add 7 frontend Dialog components with user-friendly UI
- Fix NaN serialization issues and auto type conversion
- Update all related documentation

Technical Details:
- Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py)
- Backend: QuickActionService.ts with 7 execute methods
- Frontend: 7 Dialog components with complete validation
- Toolbar: Enable 7 quick action buttons

Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
2025-12-08 17:38:08 +08:00
parent af325348b8
commit f729699510
158 changed files with 13814 additions and 273 deletions

View File

@@ -62,6 +62,15 @@ from services.docx_extractor import extract_docx_mammoth, validate_docx_file
from services.txt_extractor import extract_txt, validate_txt_file
from services.dc_executor import validate_code, execute_pandas_code
# ✨ 导入预写的数据操作函数
from operations.filter import apply_filter
from operations.recode import apply_recode
from operations.binning import apply_binning
from operations.conditional import apply_conditional_column, apply_simple_binning
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
# ==================== Pydantic Models ====================
@@ -74,6 +83,59 @@ class ExecuteCodeRequest(BaseModel):
data: List[Dict[str, Any]]
code: str
# ✨ 预写函数请求模型
class FilterRequest(BaseModel):
"""筛选请求模型"""
data: List[Dict[str, Any]]
conditions: List[Dict[str, Any]]
logic: str = 'and'
class RecodeRequest(BaseModel):
"""重编码请求模型"""
data: List[Dict[str, Any]]
column: str
mapping: Dict[Any, Any]
create_new_column: bool = True
new_column_name: str = None
class BinningRequest(BaseModel):
"""分箱请求模型"""
data: List[Dict[str, Any]]
column: str
method: str
new_column_name: str
bins: List[Any] = None
labels: List[Any] = None
num_bins: int = 3
class ConditionalRequest(BaseModel):
"""条件生成列请求模型"""
data: List[Dict[str, Any]]
new_column_name: str
rules: List[Dict[str, Any]]
else_value: Any = None
class DropnaRequest(BaseModel):
"""删除缺失值请求模型"""
data: List[Dict[str, Any]]
method: str # 'row', 'column', 'both'
threshold: float = 0.5
subset: List[str] = None
class ComputeRequest(BaseModel):
"""计算列请求模型"""
data: List[Dict[str, Any]]
new_column_name: str
formula: str
class PivotRequest(BaseModel):
"""Pivot请求模型"""
data: List[Dict[str, Any]]
index_column: str
pivot_column: str
value_columns: List[str]
aggfunc: str = 'first'
# ==================== API路由 ====================
@@ -592,6 +654,577 @@ async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
)
# ==================== ✨ 预写函数API端点 ====================
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
"""
高级筛选操作(预写函数)
Args:
request: FilterRequest
- data: List[Dict] # 输入数据
- conditions: List[Dict] # 筛选条件
- logic: str # 'and''or'
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = apply_filter(df, request.conditions, request.logic)
# 转换回JSON处理NaN和inf值
import numpy as np
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"筛选成功: {len(request.data)}{len(result_data)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"筛选操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time
})
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
"""
数值映射(重编码)操作(预写函数)
Args:
request: RecodeRequest
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = apply_recode(
df,
request.column,
request.mapping,
request.create_new_column,
request.new_column_name
)
# 转换回JSON处理NaN和inf值
import numpy as np
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"重编码成功: {request.column}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"重编码操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
})
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
"""
生成分类变量(分箱)操作(预写函数)
Args:
request: BinningRequest
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = apply_binning(
df,
request.column,
request.method,
request.new_column_name,
request.bins,
request.labels,
request.num_bins
)
# 转换回JSON处理Categorical类型和NaN值
# 1. 将Categorical列转为字符串
for col in result_df.columns:
if pd.api.types.is_categorical_dtype(result_df[col]):
result_df[col] = result_df[col].astype(str)
# 2. 将NaN替换为None避免JSON序列化错误
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"分箱成功: {request.column}{request.new_column_name}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"分箱操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
})
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
"""
条件生成列操作(预写函数)
根据多条件IF-THEN-ELSE规则生成新列
Args:
request: ConditionalRequest
- data: 数据
- new_column_name: 新列名称
- rules: 规则列表,每个规则包含 conditions, logic, result
- else_value: 默认值
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = apply_conditional_column(
df,
request.new_column_name,
request.rules,
request.else_value
)
# 转换回JSON处理NaN值
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"条件生成列成功: {request.new_column_name}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"条件生成列操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
"""
删除缺失值操作(预写函数)
Args:
request: DropnaRequest
- data: 数据
- method: 删除方式 ('row', 'column', 'both')
- threshold: 缺失率阈值0-1
- subset: 仅检查指定列(可选)
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = drop_missing_values(
df,
method=request.method,
threshold=request.threshold,
subset=request.subset
)
# 转换回JSON处理NaN值
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"删除缺失值成功: {request.method}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"删除缺失值操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
"""
计算列操作(预写函数)
基于公式计算新列
Args:
request: ComputeRequest
- data: 数据
- new_column_name: 新列名称
- formula: 计算公式
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = compute_column(
df,
request.new_column_name,
request.formula
)
# 转换回JSON处理NaN值和inf值
import numpy as np
# 1. 替换inf和-inf为None
result_df = result_df.replace([np.inf, -np.inf], None)
# 2. 替换NaN为None
result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
# 3. 转换为dict此时已经没有NaN和inf
result_data = result_df.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"计算列成功: {request.new_column_name}")
# 使用json.dumps手动序列化处理NaN
import json
response_content = {
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
}
# 手动序列化NaN会被转为null
json_str = json.dumps(response_content, allow_nan=True)
# 替换NaN为null
json_str = json_str.replace('NaN', 'null').replace('Infinity', 'null').replace('-Infinity', 'null')
return JSONResponse(content=json.loads(json_str))
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"计算列操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
"""
Pivot操作长表转宽表预写函数
将纵向重复数据转为横向数据
Args:
request: PivotRequest
- data: 数据
- index_column: 索引列
- pivot_column: 透视列
- value_columns: 值列列表
- aggfunc: 聚合函数
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 调用预写函数
result_df = pivot_long_to_wide(
df,
request.index_column,
request.pivot_column,
request.value_columns,
request.aggfunc
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"Pivot操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
# ==================== 启动配置 ====================
if __name__ == "__main__":

View File

@@ -0,0 +1,16 @@
"""
数据操作函数模块
提供预写的、经过测试的数据处理函数,供功能按钮调用。
模块列表:
- filter: 高级筛选
- recode: 数值映射(重编码)
- binning: 生成分类变量(分箱)
- conditional: 条件生成列
- missing: 缺失值处理
- duplicate: 去重
"""
__version__ = '1.0.0'

View File

@@ -0,0 +1,152 @@
"""
生成分类变量(分箱)操作
将连续数值变量转换为分类变量。
支持三种方法:自定义切点、等宽分箱、等频分箱。
"""
import pandas as pd
import numpy as np
from typing import List, Optional, Literal, Union
def apply_binning(
df: pd.DataFrame,
column: str,
method: Literal['custom', 'equal_width', 'equal_freq'],
new_column_name: str,
bins: Optional[List[Union[int, float]]] = None,
labels: Optional[List[Union[str, int]]] = None,
num_bins: int = 3
) -> pd.DataFrame:
"""
应用分箱操作
Args:
df: 输入数据框
column: 要分箱的列名
method: 分箱方法
- 'custom': 自定义切点
- 'equal_width': 等宽分箱
- 'equal_freq': 等频分箱
new_column_name: 新列名
bins: 自定义切点列表仅method='custom'时使用),如 [18, 60] → <18, 18-60, >60
labels: 标签列表(可选)
num_bins: 分组数量仅method='equal_width''equal_freq'时使用)
Returns:
分箱后的数据框
Examples:
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
>>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
... bins=[18, 60], labels=['青少年', '成年', '老年'])
>>> result['年龄分组'].tolist()
['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
"""
if df.empty:
return df
# 验证列是否存在
if column not in df.columns:
raise KeyError(f"'{column}' 不存在")
# 创建结果数据框
result = df.copy()
# 验证并转换数据类型
if not pd.api.types.is_numeric_dtype(result[column]):
# 尝试将字符串转换为数值
try:
result[column] = pd.to_numeric(result[column], errors='coerce')
print(f"警告: 列 '{column}' 已自动转换为数值类型")
except Exception as e:
raise TypeError(f"'{column}' 不是数值类型且无法转换,无法进行分箱")
# 检查是否有有效的数值
if result[column].isna().all():
raise ValueError(f"'{column}' 中没有有效的数值,无法进行分箱")
# 根据方法进行分箱
if method == 'custom':
# 自定义切点(用户输入的是中间切点,需要自动添加边界)
if not bins or len(bins) < 1:
raise ValueError('自定义切点至少需要1个值')
# 验证切点是否升序
if bins != sorted(bins):
raise ValueError('切点必须按升序排列')
# 自动添加左右边界
# 重要:始终添加边界,确保切点数+1=区间数
min_val = result[column].min()
max_val = result[column].max()
print(f'用户输入切点: {bins}')
print(f'数据范围: [{min_val:.2f}, {max_val:.2f}]')
# 构建完整的边界数组:始终添加左右边界
# 左边界取min(用户第一个切点, 数据最小值) - 0.001
# 右边界取max(用户最后一个切点, 数据最大值) + 0.001
left_bound = min(bins[0], min_val) - 0.001
right_bound = max(bins[-1], max_val) + 0.001
full_bins = [left_bound] + bins + [right_bound]
print(f'完整边界: {[f"{b:.1f}" for b in full_bins]}')
print(f'将生成 {len(full_bins) - 1} 个区间 = {len(bins) + 1} 个区间')
# 验证标签数量(区间数 = 边界数 - 1
expected_label_count = len(full_bins) - 1
if labels and len(labels) != expected_label_count:
raise ValueError(f'标签数量({len(labels)})必须等于区间数量({expected_label_count}')
result[new_column_name] = pd.cut(
result[column],
bins=full_bins,
labels=labels,
right=False,
include_lowest=True
)
elif method == 'equal_width':
# 等宽分箱
if num_bins < 2:
raise ValueError('分组数量至少为2')
result[new_column_name] = pd.cut(
result[column],
bins=num_bins,
labels=labels,
include_lowest=True
)
elif method == 'equal_freq':
# 等频分箱
if num_bins < 2:
raise ValueError('分组数量至少为2')
result[new_column_name] = pd.qcut(
result[column],
q=num_bins,
labels=labels,
duplicates='drop' # 处理重复边界值
)
else:
raise ValueError(f"不支持的分箱方法: {method}")
# 统计分布
print(f'分箱结果分布:')
value_counts = result[new_column_name].value_counts().sort_index()
for category, count in value_counts.items():
percentage = count / len(result) * 100
print(f' {category}: {count} 行 ({percentage:.1f}%)')
# 缺失值统计
missing_count = result[new_column_name].isna().sum()
if missing_count > 0:
print(f'警告: {missing_count} 个值无法分箱(可能是缺失值或边界问题)')
return result

View File

@@ -0,0 +1,227 @@
"""
计算列 - 预写函数
基于公式计算新列,支持数学运算和常用函数
"""
import pandas as pd
import numpy as np
import re
from typing import Dict, Any
# 允许的函数(安全白名单)
ALLOWED_FUNCTIONS = {
'abs': abs,
'round': round,
'sqrt': np.sqrt,
'log': np.log,
'log10': np.log10,
'exp': np.exp,
'sin': np.sin,
'cos': np.cos,
'tan': np.tan,
'floor': np.floor,
'ceil': np.ceil,
'min': min,
'max': max,
}
def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
"""
验证公式安全性和正确性
Args:
formula: 公式字符串
available_columns: 可用的列名列表
Returns:
(is_valid, error_message)
"""
# 检查是否为空
if not formula or not formula.strip():
return False, '公式不能为空'
# 检查危险操作
dangerous_patterns = [
r'__', # 双下划线Python内部属性
r'import\s', # import语句
r'exec\s', # exec函数
r'eval\s', # eval函数
r'open\s*\(', # 文件操作
r'compile\s*\(', # 编译函数
r'globals\s*\(', # 全局变量
r'locals\s*\(', # 局部变量
r'__builtins__', # 内置函数
]
for pattern in dangerous_patterns:
if re.search(pattern, formula, re.IGNORECASE):
return False, f'公式包含不允许的操作: {pattern}'
# 检查是否只包含允许的字符
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\.,\*\*]'
if not re.match(f'^{allowed_chars}+$', formula):
return False, '公式包含不允许的字符'
return True, ''
def compute_column(
df: pd.DataFrame,
new_column_name: str,
formula: str
) -> pd.DataFrame:
"""
基于公式计算新列
Args:
df: 输入数据框
new_column_name: 新列名称
formula: 计算公式
- 支持列名引用(如:身高, 体重)
- 支持运算符(+, -, *, /, **, %
- 支持函数abs, round, sqrt, log, exp等
Returns:
添加了新列的数据框
示例:
# BMI计算
compute_column(df, 'BMI', '体重 / (身高/100)**2')
# 年龄平方根
compute_column(df, '年龄_sqrt', 'sqrt(年龄)')
# 复杂公式
compute_column(df, '综合得分', '(FMA*0.6 + ADL*0.4) / 100')
"""
result = df.copy()
print(f'计算新列: {new_column_name}')
print(f'公式: {formula}')
print('')
# 验证公式
is_valid, error_msg = validate_formula(formula, list(result.columns))
if not is_valid:
raise ValueError(f'公式验证失败: {error_msg}')
# 准备执行环境
# 1. 添加数据框的列作为变量(自动转换数值类型)
env = {}
for col in result.columns:
# 尝试将列转换为数值类型
try:
# 如果列可以转换为数值,就转换
numeric_col = pd.to_numeric(result[col], errors='coerce')
# 如果转换后不全是NaN说明是数值列
if not numeric_col.isna().all():
env[col] = numeric_col
print(f'"{col}" 自动转换为数值类型')
else:
# 否则保持原样
env[col] = result[col]
except Exception:
# 转换失败,保持原样
env[col] = result[col]
# 2. 添加允许的函数
env.update(ALLOWED_FUNCTIONS)
# 3. 添加numpy用于数学运算
env['np'] = np
try:
# 执行公式计算
result[new_column_name] = eval(formula, {"__builtins__": {}}, env)
print(f'计算成功!')
print(f'新列类型: {result[new_column_name].dtype}')
print(f'新列前5个值:')
# 安全打印避免NaN/inf导致序列化错误
for idx, val in result[new_column_name].head().items():
if pd.isna(val):
print(f' [{idx}] None (NaN)')
elif np.isinf(val):
print(f' [{idx}] None (inf)')
else:
print(f' [{idx}] {val}')
print('')
# 统计结果
if pd.api.types.is_numeric_dtype(result[new_column_name]):
col_data = result[new_column_name]
# 统计缺失值和无效值
nan_count = col_data.isna().sum()
inf_count = np.isinf(col_data.replace([np.nan], 0)).sum()
print(f'统计信息:')
# 只对有效值计算统计量
valid_data = col_data.dropna().replace([np.inf, -np.inf], np.nan).dropna()
if len(valid_data) > 0:
print(f' 最小值: {valid_data.min():.2f}')
print(f' 最大值: {valid_data.max():.2f}')
print(f' 平均值: {valid_data.mean():.2f}')
else:
print(f' 没有有效的数值')
if nan_count > 0:
print(f' 缺失值(NaN): {nan_count}')
if inf_count > 0:
print(f' 无穷大值(inf): {inf_count}')
else:
print(f'非数值类型,跳过统计')
return result
except NameError as e:
# 列名不存在
missing_col = str(e).split("'")[1]
raise ValueError(f'"{missing_col}" 不存在,请检查公式中的列名')
except ZeroDivisionError:
raise ValueError('除零错误公式中存在除以0的情况')
except Exception as e:
raise ValueError(f'计算失败: {str(e)}')
def get_formula_examples() -> list[Dict[str, str]]:
"""
获取公式示例
Returns:
示例列表
"""
return [
{
'name': 'BMI计算',
'formula': '体重 / (身高/100)**2',
'description': '体重指数(需要身高(cm)和体重(kg)列)'
},
{
'name': '年龄分组',
'formula': 'round(年龄 / 10) * 10',
'description': '按10岁为一组20, 30, 40...'
},
{
'name': '综合得分',
'formula': '(FMA得分 * 0.6 + ADL得分 * 0.4)',
'description': '加权平均分'
},
{
'name': '变化率',
'formula': '(随访值 - 基线值) / 基线值 * 100',
'description': '计算变化百分比'
},
{
'name': '对数转换',
'formula': 'log(值 + 1)',
'description': '对数变换(处理偏态分布)'
},
]

View File

@@ -0,0 +1,188 @@
"""
条件生成列 - 预写函数
支持复杂的IF-THEN-ELSE多条件逻辑
"""
import pandas as pd
from typing import List, Dict, Any, Union
def apply_conditional_column(
df: pd.DataFrame,
new_column_name: str,
rules: List[Dict[str, Any]],
else_value: Any = None
) -> pd.DataFrame:
"""
根据多条件规则生成新列
Args:
df: 输入数据框
new_column_name: 新列名称
rules: 规则列表,每个规则包含:
- conditions: 条件列表
- logic: 'and''or'
- result: 满足条件时的结果值
else_value: 所有规则都不满足时的默认值
Returns:
添加了新列的数据框
示例:
rules = [
{
"conditions": [
{"column": "年龄", "operator": ">=", "value": 60}
],
"logic": "and",
"result": "老年"
},
{
"conditions": [
{"column": "年龄", "operator": ">=", "value": 18},
{"column": "年龄", "operator": "<", "value": 60}
],
"logic": "and",
"result": "成年"
}
]
"""
result = df.copy()
# 验证规则
if not rules or len(rules) == 0:
raise ValueError('至少需要1条规则')
# 验证所有引用的列是否存在
for rule in rules:
for condition in rule.get('conditions', []):
column = condition.get('column')
if column not in result.columns:
raise ValueError(f'"{column}" 不存在')
# 初始化新列为else_value
result[new_column_name] = else_value
print(f'开始应用条件规则,共 {len(rules)} 条规则')
# 按顺序应用每条规则
for rule_idx, rule in enumerate(rules, 1):
conditions = rule.get('conditions', [])
logic = rule.get('logic', 'and')
result_value = rule.get('result')
if not conditions:
continue
# 构建每个条件的mask
masks = []
for condition in conditions:
column = condition['column']
operator = condition['operator']
value = condition['value']
# 智能类型转换:对于数字比较运算符,尝试将列转换为数字
if operator in ('>', '<', '>=', '<='):
# 尝试将列转换为数字类型
try:
col_data = pd.to_numeric(result[column], errors='coerce')
# 确保value也是数字
if not isinstance(value, (int, float)):
value = float(value)
except Exception:
# 如果转换失败,使用原始数据
col_data = result[column]
else:
# 对于相等/不相等比较,使用原始数据
col_data = result[column]
# 根据运算符生成mask
if operator == '=':
mask = col_data == value
elif operator == '!=':
mask = col_data != value
elif operator == '>':
mask = col_data > value
elif operator == '<':
mask = col_data < value
elif operator == '>=':
mask = col_data >= value
elif operator == '<=':
mask = col_data <= value
else:
raise ValueError(f'不支持的运算符: {operator}')
masks.append(mask)
# 组合条件
if logic == 'and':
final_mask = pd.concat(masks, axis=1).all(axis=1)
elif logic == 'or':
final_mask = pd.concat(masks, axis=1).any(axis=1)
else:
raise ValueError(f'不支持的逻辑运算符: {logic}')
# 应用规则
matched_count = final_mask.sum()
result.loc[final_mask, new_column_name] = result_value
print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')
# 统计结果分布
print(f'\n结果分布:')
value_counts = result[new_column_name].value_counts(dropna=False)
for value, count in value_counts.items():
percentage = count / len(result) * 100
if pd.isna(value):
print(f' (空值): {count} 行 ({percentage:.1f}%)')
else:
print(f' {value}: {count} 行 ({percentage:.1f}%)')
return result
def apply_simple_binning(
df: pd.DataFrame,
column: str,
new_column_name: str,
threshold: float,
value_if_true: Any = 1,
value_if_false: Any = 0
) -> pd.DataFrame:
"""
简单二分类(单一阈值判断)
这是条件生成列的简化版本,用于单一阈值判断
Args:
df: 输入数据框
column: 用于判断的列
new_column_name: 新列名称
threshold: 阈值
value_if_true: >= threshold时的值
value_if_false: < threshold时的值
Returns:
添加了新列的数据框
示例:
住院患者暴露分组:
督脉针刺持续时间 >= 10 → 1 (暴露)
督脉针刺持续时间 < 10 → 0 (非暴露)
"""
result = df.copy()
if column not in result.columns:
raise ValueError(f'"{column}" 不存在')
# 简单的阈值判断
result[new_column_name] = (result[column] >= threshold).astype(int) * value_if_true + \
(result[column] < threshold).astype(int) * value_if_false
# 统计分布
print(f'简单二分类结果:')
print(f' {column} >= {threshold}: {(result[column] >= threshold).sum()} 行 → {value_if_true}')
print(f' {column} < {threshold}: {(result[column] < threshold).sum()} 行 → {value_if_false}')
return result

View File

@@ -0,0 +1,149 @@
"""
删除缺失值 - 预写函数
支持按行删除、按列删除、阈值控制
"""
import pandas as pd
from typing import Literal, Optional, List
def drop_missing_values(
df: pd.DataFrame,
method: Literal['row', 'column', 'both'] = 'row',
threshold: Optional[float] = None,
subset: Optional[List[str]] = None
) -> pd.DataFrame:
"""
删除缺失值
Args:
df: 输入数据框
method: 删除方式
- 'row': 删除包含缺失值的行
- 'column': 删除缺失值过多的列
- 'both': 先删除列,再删除行
threshold: 缺失率阈值0-1之间仅对'column''both'有效
- 如果列的缺失率超过此阈值,则删除该列
- 默认为0.550%
subset: 仅检查指定列的缺失值(仅对'row'有效)
Returns:
删除缺失值后的数据框
示例:
# 删除包含任何缺失值的行
drop_missing_values(df, method='row')
# 删除缺失率>30%的列
drop_missing_values(df, method='column', threshold=0.3)
# 先删除缺失列,再删除缺失行
drop_missing_values(df, method='both', threshold=0.5)
# 仅检查指定列
drop_missing_values(df, method='row', subset=['年龄', 'BMI'])
"""
result = df.copy()
original_shape = result.shape
print(f'原始数据: {original_shape[0]}× {original_shape[1]}')
print(f'缺失值总数: {result.isna().sum().sum()}')
print('')
# 默认阈值
if threshold is None:
threshold = 0.5
# 按列删除
if method in ('column', 'both'):
# 计算每列的缺失率
missing_rate = result.isna().sum() / len(result)
cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()
if cols_to_drop:
print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}')
for col in cols_to_drop:
rate = missing_rate[col]
count = result[col].isna().sum()
print(f' - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')
result = result.drop(columns=cols_to_drop)
print(f'删除后: {result.shape[0]}× {result.shape[1]}')
print('')
else:
print(f'没有找到缺失率>{threshold*100:.0f}%的列')
print('')
# 按行删除
if method in ('row', 'both'):
before_rows = len(result)
if subset:
# 仅检查指定列
print(f'仅检查指定列的缺失值: {subset}')
result = result.dropna(subset=subset)
else:
# 检查所有列
result = result.dropna()
dropped_rows = before_rows - len(result)
if dropped_rows > 0:
print(f'删除了 {dropped_rows} 行(包含缺失值的行)')
print(f'保留了 {len(result)} 行({len(result)/before_rows*100:.1f}%')
else:
print('没有找到包含缺失值的行')
print('')
# 最终统计
final_shape = result.shape
print(f'最终结果: {final_shape[0]}× {final_shape[1]}')
print(f'删除了 {original_shape[0] - final_shape[0]}')
print(f'删除了 {original_shape[1] - final_shape[1]}')
print(f'剩余缺失值: {result.isna().sum().sum()}')
# 如果结果为空,给出警告
if len(result) == 0:
print('\n⚠️ 警告: 删除后数据为空!')
return result
def get_missing_summary(df: pd.DataFrame) -> dict:
"""
获取缺失值统计摘要
Args:
df: 输入数据框
Returns:
缺失值统计信息
"""
total_cells = df.shape[0] * df.shape[1]
total_missing = df.isna().sum().sum()
# 按列统计
col_missing = df.isna().sum()
col_missing_rate = col_missing / len(df)
cols_with_missing = col_missing[col_missing > 0].to_dict()
cols_missing_rate = col_missing_rate[col_missing > 0].to_dict()
# 按行统计
row_missing = df.isna().sum(axis=1)
rows_with_missing = (row_missing > 0).sum()
return {
'total_cells': total_cells,
'total_missing': int(total_missing),
'missing_rate': total_missing / total_cells if total_cells > 0 else 0,
'rows_with_missing': int(rows_with_missing),
'cols_with_missing': len(cols_with_missing),
'col_missing_detail': {
col: {
'count': int(count),
'rate': float(cols_missing_rate[col])
}
for col, count in cols_with_missing.items()
}
}

View File

@@ -0,0 +1,109 @@
"""
高级筛选操作
提供多条件筛选功能支持AND/OR逻辑组合。
"""
import pandas as pd
from typing import List, Dict, Any, Literal
def apply_filter(
df: pd.DataFrame,
conditions: List[Dict[str, Any]],
logic: Literal['and', 'or'] = 'and'
) -> pd.DataFrame:
"""
应用筛选条件
Args:
df: 输入数据框
conditions: 筛选条件列表,每个条件包含:
- column: 列名
- operator: 运算符 (=, !=, >, <, >=, <=, contains, not_contains,
starts_with, ends_with, is_null, not_null)
- value: 值is_null和not_null不需要
logic: 逻辑组合方式 ('and''or')
Returns:
筛选后的数据框
Examples:
>>> df = pd.DataFrame({'年龄': [25, 35, 45], '性别': ['', '', '']})
>>> conditions = [
... {'column': '年龄', 'operator': '>', 'value': 30},
... {'column': '性别', 'operator': '=', 'value': ''}
... ]
>>> result = apply_filter(df, conditions, logic='and')
>>> len(result)
1
"""
if not conditions:
raise ValueError('筛选条件不能为空')
if df.empty:
return df
# 生成各个条件的mask
masks = []
for cond in conditions:
column = cond['column']
operator = cond['operator']
value = cond.get('value')
# 验证列是否存在
if column not in df.columns:
raise KeyError(f"'{column}' 不存在")
# 根据运算符生成mask
if operator == '=':
mask = df[column] == value
elif operator == '!=':
mask = df[column] != value
elif operator == '>':
mask = df[column] > value
elif operator == '<':
mask = df[column] < value
elif operator == '>=':
mask = df[column] >= value
elif operator == '<=':
mask = df[column] <= value
elif operator == 'contains':
mask = df[column].astype(str).str.contains(str(value), na=False)
elif operator == 'not_contains':
mask = ~df[column].astype(str).str.contains(str(value), na=False)
elif operator == 'starts_with':
mask = df[column].astype(str).str.startswith(str(value), na=False)
elif operator == 'ends_with':
mask = df[column].astype(str).str.endswith(str(value), na=False)
elif operator == 'is_null':
mask = df[column].isna()
elif operator == 'not_null':
mask = df[column].notna()
else:
raise ValueError(f"不支持的运算符: {operator}")
masks.append(mask)
# 组合所有条件
if logic == 'and':
final_mask = pd.concat(masks, axis=1).all(axis=1)
elif logic == 'or':
final_mask = pd.concat(masks, axis=1).any(axis=1)
else:
raise ValueError(f"不支持的逻辑运算: {logic}")
# 应用筛选
result = df[final_mask].copy()
# 打印统计信息
original_rows = len(df)
filtered_rows = len(result)
removed_rows = original_rows - filtered_rows
print(f'原始数据: {original_rows}')
print(f'筛选后: {filtered_rows}')
print(f'删除: {removed_rows} 行 ({removed_rows/original_rows*100:.1f}%)')
return result

View File

@@ -0,0 +1,161 @@
"""
Pivot操作 - 预写函数
长表转宽表(一人多行 → 一人一行)
"""
import pandas as pd
from typing import List, Literal, Optional
def pivot_long_to_wide(
df: pd.DataFrame,
index_column: str,
pivot_column: str,
value_columns: List[str],
aggfunc: Literal['first', 'last', 'mean', 'sum', 'min', 'max'] = 'first'
) -> pd.DataFrame:
"""
长表转宽表Pivot
将纵向重复的数据转为横向数据
Args:
df: 输入数据框
index_column: 索引列(唯一标识,如 Record ID
pivot_column: 透视列(将变成新列名的列,如 Event Name
value_columns: 值列(要转置的数据列,如 FMA得分, ADL得分
aggfunc: 聚合函数
- 'first': 取第一个值(推荐)
- 'last': 取最后一个值
- 'mean': 求平均值
- 'sum': 求和
- 'min': 取最小值
- 'max': 取最大值
Returns:
宽表数据框
示例:
pivot_long_to_wide(
df,
index_column='Record ID',
pivot_column='Event Name',
value_columns=['FMA得分', 'ADL得分'],
aggfunc='first'
)
"""
result = df.copy()
print(f'原始数据: {len(result)}× {len(result.columns)}')
print(f'索引列: {index_column}')
print(f'透视列: {pivot_column}')
print(f'值列: {", ".join(value_columns)}')
print(f'聚合方式: {aggfunc}')
print('')
# 验证列是否存在
required_cols = [index_column, pivot_column] + value_columns
missing_cols = [col for col in required_cols if col not in result.columns]
if missing_cols:
raise ValueError(f'以下列不存在: {", ".join(missing_cols)}')
# 检查索引列的唯一值数量
unique_index = result[index_column].nunique()
print(f'唯一{index_column}数量: {unique_index}')
# 检查透视列的唯一值
unique_pivot = result[pivot_column].unique()
print(f'透视列"{pivot_column}"的唯一值: {list(unique_pivot)}')
print('')
try:
# 执行Pivot转换
df_pivot = result.pivot_table(
index=index_column,
columns=pivot_column,
values=value_columns,
aggfunc=aggfunc
)
# 展平多级列名
# 如果只有一个值列,列名是单层的
if len(value_columns) == 1:
df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]
else:
# 多个值列,列名是多层的,需要展平
df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
# 重置索引将index列变回普通列
df_pivot = df_pivot.reset_index()
print(f'转换成功!')
print(f'结果: {len(df_pivot)}× {len(df_pivot.columns)}')
print(f'新增列: {len(df_pivot.columns) - 1}')
print('')
# 显示新列名
print(f'生成的列名:')
new_cols = [col for col in df_pivot.columns if col != index_column]
for i, col in enumerate(new_cols[:10], 1): # 只显示前10个
print(f' {i}. {col}')
if len(new_cols) > 10:
print(f' ... 还有 {len(new_cols) - 10}')
return df_pivot
except ValueError as e:
# Pivot失败可能有重复的index+pivot组合
if 'Index contains duplicate entries' in str(e):
# 统计重复情况
duplicates = result.groupby([index_column, pivot_column]).size()
duplicates = duplicates[duplicates > 1]
print('⚠️ 警告: 发现重复的索引+透视组合:')
for (idx, piv), count in duplicates.head(5).items():
print(f' {index_column}={idx}, {pivot_column}={piv}: {count}')
if len(duplicates) > 5:
print(f' ... 还有 {len(duplicates) - 5} 个重复组合')
print(f'\n建议: 使用聚合函数如mean、sum处理重复值')
print(f'当前聚合方式: {aggfunc}')
raise ValueError(f'存在重复的{index_column}+{pivot_column}组合,需要选择合适的聚合方式')
else:
raise e
def get_pivot_preview(
df: pd.DataFrame,
index_column: str,
pivot_column: str
) -> dict:
"""
获取Pivot预览信息
Args:
df: 输入数据框
index_column: 索引列
pivot_column: 透视列
Returns:
预览信息
"""
# 统计唯一值
unique_index = df[index_column].nunique()
unique_pivot = df[pivot_column].unique()
# 检查是否有重复
duplicates = df.groupby([index_column, pivot_column]).size()
has_duplicates = (duplicates > 1).any()
duplicate_count = (duplicates > 1).sum() if has_duplicates else 0
return {
'unique_index_count': int(unique_index),
'unique_pivot_values': [str(v) for v in unique_pivot],
'has_duplicates': bool(has_duplicates),
'duplicate_count': int(duplicate_count),
'estimated_rows': int(unique_index),
'estimated_columns': len(unique_pivot)
}

View File

@@ -0,0 +1,79 @@
"""
数值映射(重编码)操作
将分类变量的原始值映射为新值男→1女→2
"""
import pandas as pd
from typing import Dict, Any, Optional
def apply_recode(
df: pd.DataFrame,
column: str,
mapping: Dict[Any, Any],
create_new_column: bool = True,
new_column_name: Optional[str] = None
) -> pd.DataFrame:
"""
应用数值映射
Args:
df: 输入数据框
column: 要重编码的列名
mapping: 映射字典,如 {'': 1, '': 2}
create_new_column: 是否创建新列True或覆盖原列False
new_column_name: 新列名create_new_column=True时使用
Returns:
重编码后的数据框
Examples:
>>> df = pd.DataFrame({'性别': ['', '', '', '']})
>>> mapping = {'': 1, '': 2}
>>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
>>> result['性别_编码'].tolist()
[1, 2, 1, 2]
"""
if df.empty:
return df
# 验证列是否存在
if column not in df.columns:
raise KeyError(f"'{column}' 不存在")
if not mapping:
raise ValueError('映射字典不能为空')
# 确定目标列名
if create_new_column:
target_column = new_column_name or f'{column}_编码'
else:
target_column = column
# 创建结果数据框(避免修改原数据)
result = df.copy()
# 应用映射
result[target_column] = result[column].map(mapping)
# 统计结果
mapped_count = result[target_column].notna().sum()
unmapped_count = result[target_column].isna().sum()
total_count = len(result)
print(f'映射完成: {mapped_count} 个值成功映射')
if unmapped_count > 0:
print(f'警告: {unmapped_count} 个值未找到对应映射')
# 找出未映射的唯一值
unmapped_mask = result[target_column].isna()
unmapped_values = result.loc[unmapped_mask, column].unique()
print(f'未映射的值: {list(unmapped_values)[:10]}') # 最多显示10个
# 映射成功率
success_rate = (mapped_count / total_count * 100) if total_count > 0 else 0
print(f'映射成功率: {success_rate:.1f}%')
return result

View File

@@ -279,3 +279,5 @@ if __name__ == "__main__":
main()

View File

@@ -45,3 +45,5 @@ except Exception as e:
print(f"\n❌ 测试异常: {str(e)}")

View File

@@ -25,3 +25,5 @@ except Exception as e:
traceback.print_exc()