feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions
Summary:
- Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot)
- Refactor to pre-written Python functions architecture (stable and secure)
- Add 7 Python operations modules with full type hints
- Add 7 frontend Dialog components with user-friendly UI
- Fix NaN serialization issues and auto type conversion
- Update all related documentation

Technical Details:
- Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py)
- Backend: QuickActionService.ts with 7 execute methods
- Frontend: 7 Dialog components with complete validation
- Toolbar: Enable 7 quick action buttons

Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
@@ -62,6 +62,15 @@ from services.docx_extractor import extract_docx_mammoth, validate_docx_file
|
||||
from typing import Optional

from services.txt_extractor import extract_txt, validate_txt_file
from services.dc_executor import validate_code, execute_pandas_code

# ✨ 导入预写的数据操作函数
from operations.filter import apply_filter
from operations.recode import apply_recode
from operations.binning import apply_binning
from operations.conditional import apply_conditional_column, apply_simple_binning
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||||
|
||||
|
||||
# ==================== Pydantic Models ====================
|
||||
|
||||
@@ -74,6 +83,59 @@ class ExecuteCodeRequest(BaseModel):
|
||||
data: List[Dict[str, Any]]
|
||||
code: str
|
||||
|
||||
# ✨ 预写函数请求模型
|
||||
class FilterRequest(BaseModel):
    """Request model for the advanced filter operation."""
    # Input rows, one dict per record (column name -> value).
    data: List[Dict[str, Any]]
    # Filter conditions; each item presumably holds column/operator/value —
    # TODO confirm against apply_filter's contract.
    conditions: List[Dict[str, Any]]
    # How conditions combine: 'and' (all must match) or 'or' (any matches).
    logic: str = 'and'
|
||||
|
||||
class RecodeRequest(BaseModel):
    """Request model for the value-recoding operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Column whose values are recoded.
    column: str
    # old value -> new value mapping.
    mapping: Dict[Any, Any]
    # When True, write results to a new column instead of overwriting.
    create_new_column: bool = True
    # Optional target column name; was `str = None`, which is an invalid
    # annotation (rejected as a required-but-None field under pydantic v2).
    new_column_name: Optional[str] = None
|
||||
|
||||
class BinningRequest(BaseModel):
    """Request model for the binning (categorize) operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Numeric column to bin.
    column: str
    # Binning method: 'custom', 'equal_width' or 'equal_freq'.
    method: str
    # Name of the generated category column.
    new_column_name: str
    # Custom cut points (method='custom' only); was `List[Any] = None`,
    # an invalid annotation — now properly Optional.
    bins: Optional[List[Any]] = None
    # Optional labels for the generated intervals.
    labels: Optional[List[Any]] = None
    # Number of bins for equal_width / equal_freq.
    num_bins: int = 3
|
||||
|
||||
class ConditionalRequest(BaseModel):
    """Request model for the conditional-column operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Name of the column to create.
    new_column_name: str
    # IF-THEN rules; each rule holds `conditions`, `logic` ('and'/'or')
    # and `result` (see apply_conditional_column).
    rules: List[Dict[str, Any]]
    # Value assigned to rows that no rule matches.
    else_value: Any = None
|
||||
|
||||
class DropnaRequest(BaseModel):
    """Request model for the drop-missing-values operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Drop mode: 'row', 'column' or 'both'.
    method: str
    # Missing-rate threshold (0-1) used for column dropping.
    threshold: float = 0.5
    # Restrict the missing-value check to these columns; was
    # `List[str] = None`, an invalid annotation — now properly Optional.
    subset: Optional[List[str]] = None
|
||||
|
||||
class ComputeRequest(BaseModel):
    """Request model for the computed-column operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Name of the column to create.
    new_column_name: str
    # Expression referencing column names, arithmetic operators and
    # whitelisted functions (validated server-side by compute_column).
    formula: str
|
||||
|
||||
class PivotRequest(BaseModel):
    """Request model for the long-to-wide pivot operation."""
    # Input rows, one dict per record.
    data: List[Dict[str, Any]]
    # Column that identifies each output row.
    index_column: str
    # Column whose distinct values become new columns.
    pivot_column: str
    # Value columns to spread across the new columns.
    value_columns: List[str]
    # Aggregation applied to duplicate cells (pandas aggfunc name).
    aggfunc: str = 'first'
|
||||
|
||||
|
||||
# ==================== API路由 ====================
|
||||
|
||||
@@ -592,6 +654,577 @@ async def execute_pandas_code_endpoint(request: ExecuteCodeRequest):
|
||||
)
|
||||
|
||||
|
||||
# ==================== ✨ 预写函数API端点 ====================
|
||||
|
||||
@app.post("/api/operations/filter")
async def operation_filter(request: FilterRequest):
    """
    Advanced filter operation (pre-written function).

    Args:
        request: FilterRequest
            - data: input rows (list of dicts)
            - conditions: filter conditions
            - logic: 'and' or 'or'

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape ([rows, cols]); HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound before any fallible work so the error branch never hits a
    # NameError on start_time (the old code could).
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # redirect_stdout restores stdout even when an exception escapes,
        # unlike the previous manual sys.stdout assignment.
        # NOTE(review): stdout is still process-global, so concurrent
        # requests may interleave captured output — confirm acceptable.
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_filter(df, request.conditions, request.logic)

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"筛选成功: {len(request.data)} → {len(result_data)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"筛选操作失败: {str(e)}")
        # status_code=400 added for consistency with the other operation
        # endpoints (conditional/dropna/compute/pivot already return 400).
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/recode")
async def operation_recode(request: RecodeRequest):
    """
    Value recoding operation (pre-written function).

    Args:
        request: RecodeRequest with data, column, mapping,
            create_new_column and optional new_column_name.

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter about
        # stdout being process-global under concurrency).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_recode(
                df,
                request.column,
                request.mapping,
                request.create_new_column,
                request.new_column_name
            )

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"重编码成功: {request.column}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"重编码操作失败: {str(e)}")
        # status_code=400 added for consistency with the other endpoints.
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/binning")
async def operation_binning(request: BinningRequest):
    """
    Binning (generate categorical variable) operation (pre-written function).

    Args:
        request: BinningRequest with data, column, method,
            new_column_name and optional bins/labels/num_bins.

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_binning(
                df,
                request.column,
                request.method,
                request.new_column_name,
                request.bins,
                request.labels,
                request.num_bins
            )

            # 1. Categorical columns (produced by pd.cut/qcut) are not JSON
            #    serializable — stringify them. isinstance check replaces the
            #    deprecated pd.api.types.is_categorical_dtype().
            for col in result_df.columns:
                if isinstance(result_df[col].dtype, pd.CategoricalDtype):
                    result_df[col] = result_df[col].astype(str)

            # 2. NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"分箱成功: {request.column} → {request.new_column_name}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"分箱操作失败: {str(e)}")
        # status_code=400 added for consistency with the other endpoints.
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/conditional")
async def operation_conditional(request: ConditionalRequest):
    """
    Conditional column generation (pre-written function).

    Builds a new column from IF-THEN-ELSE rules.

    Args:
        request: ConditionalRequest
            - data: input rows
            - new_column_name: column to create
            - rules: list of rules, each with conditions/logic/result
            - else_value: default for unmatched rows

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = apply_conditional_column(
                df,
                request.new_column_name,
                request.rules,
                request.else_value
            )

            # NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"条件生成列成功: {request.new_column_name}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"条件生成列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/dropna")
async def operation_dropna(request: DropnaRequest):
    """
    Drop-missing-values operation (pre-written function).

    Args:
        request: DropnaRequest
            - data: input rows
            - method: 'row', 'column' or 'both'
            - threshold: missing-rate threshold (0-1)
            - subset: optional list of columns to check

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = drop_missing_values(
                df,
                method=request.method,
                threshold=request.threshold,
                subset=request.subset
            )

            # NaN -> None to avoid JSON serialization errors.
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"删除缺失值成功: {request.method}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"删除缺失值操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/compute")
async def operation_compute(request: ComputeRequest):
    """
    Computed-column operation (pre-written function).

    Computes a new column from a formula.

    Args:
        request: ComputeRequest
            - data: input rows
            - new_column_name: column to create
            - formula: expression over existing columns

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = compute_column(
                df,
                request.new_column_name,
                request.formula
            )

            # Sanitize non-JSON values up front: inf -> None, NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"计算列成功: {request.new_column_name}")

        # BUGFIX: the previous json.dumps(...).replace('NaN', 'null') pass is
        # gone. It corrupted legitimate string values containing "NaN" or
        # "Infinity", and its replace order turned '-Infinity' into the
        # invalid token '-null'. The data is already sanitized above, so a
        # plain JSONResponse is both correct and sufficient.
        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"计算列操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/pivot")
async def operation_pivot(request: PivotRequest):
    """
    Pivot operation: long-to-wide reshape (pre-written function).

    Args:
        request: PivotRequest
            - data: input rows
            - index_column: column identifying each output row
            - pivot_column: column whose values become new columns
            - value_columns: value columns to spread
            - aggfunc: aggregation for duplicate cells

    Returns:
        JSON with success, result_data, output, execution_time and
        result_shape; HTTP 400 with an error on failure.
    """
    import pandas as pd
    import numpy as np
    import time
    import io
    import contextlib

    # Bound up front so the error branch can always report a duration.
    start_time = time.time()
    captured_output = io.StringIO()

    try:
        # Exception-safe stdout capture (see NOTE in operation_filter).
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)
            result_df = pivot_long_to_wide(
                df,
                request.index_column,
                request.pivot_column,
                request.value_columns,
                request.aggfunc
            )

            # Sanitize values JSON cannot represent: inf/NaN -> None.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time
        logger.info(f"Pivot成功: {request.index_column} × {request.pivot_column}")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": captured_output.getvalue(),
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        logger.error(f"Pivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== 启动配置 ====================
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
16
extraction_service/operations/__init__.py
Normal file
16
extraction_service/operations/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
Data operation function module.

Provides pre-written, tested data-processing functions for the quick
action buttons.

Modules (corrected — the old list named non-existent `missing` and
`duplicate` modules):
- filter: advanced filtering
- recode: value mapping (recoding)
- binning: generate categorical variable (binning)
- conditional: conditional column generation
- dropna: drop missing values
- compute: computed columns
- pivot: long-to-wide reshape
"""

__version__ = '1.0.0'
|
||||
|
||||
152
extraction_service/operations/binning.py
Normal file
152
extraction_service/operations/binning.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""
|
||||
生成分类变量(分箱)操作
|
||||
|
||||
将连续数值变量转换为分类变量。
|
||||
支持三种方法:自定义切点、等宽分箱、等频分箱。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Optional, Literal, Union
|
||||
|
||||
|
||||
def apply_binning(
    df: pd.DataFrame,
    column: str,
    method: Literal['custom', 'equal_width', 'equal_freq'],
    new_column_name: str,
    bins: Optional[List[Union[int, float]]] = None,
    labels: Optional[List[Union[str, int]]] = None,
    num_bins: int = 3
) -> pd.DataFrame:
    """
    Bin a continuous numeric column into a categorical column.

    Args:
        df: input DataFrame
        column: column to bin
        method: binning method
            - 'custom': user-supplied interior cut points
            - 'equal_width': equal-width bins
            - 'equal_freq': equal-frequency bins (quantiles)
        new_column_name: name of the generated category column
        bins: interior cut points (method='custom' only); e.g. [18, 60]
            yields <18, 18-60, >=60
        labels: optional interval labels
        num_bins: number of bins for equal_width / equal_freq

    Returns:
        Copy of `df` with the new category column appended.

    Raises:
        KeyError: if `column` does not exist.
        ValueError: on invalid cut points, label counts, or method.

    Examples:
        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
        >>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'])
        >>> result['年龄分组'].tolist()
        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
    """
    if df.empty:
        return df

    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    result = df.copy()

    # Coerce non-numeric columns; unparseable entries become NaN.
    # (errors='coerce' never raises, so the old dead try/except is gone.)
    if not pd.api.types.is_numeric_dtype(result[column]):
        result[column] = pd.to_numeric(result[column], errors='coerce')
        print(f"警告: 列 '{column}' 已自动转换为数值类型")

    if result[column].isna().all():
        raise ValueError(f"列 '{column}' 中没有有效的数值,无法进行分箱")

    if method == 'custom':
        # User supplies interior cut points; outer boundaries are added
        # automatically so N cut points always yield N+1 intervals.
        if not bins or len(bins) < 1:
            raise ValueError('自定义切点至少需要1个值')

        if bins != sorted(bins):
            raise ValueError('切点必须按升序排列')

        min_val = result[column].min()
        max_val = result[column].max()

        print(f'用户输入切点: {bins}')
        print(f'数据范围: [{min_val:.2f}, {max_val:.2f}]')

        # Outer boundaries: widen by 0.001 so min/max values fall inside
        # the half-open intervals produced by right=False.
        left_bound = min(bins[0], min_val) - 0.001
        right_bound = max(bins[-1], max_val) + 0.001

        full_bins = [left_bound] + bins + [right_bound]

        print(f'完整边界: {[f"{b:.1f}" for b in full_bins]}')
        print(f'将生成 {len(full_bins) - 1} 个区间 = {len(bins) + 1} 个区间')

        # BUGFIX: `is not None` instead of truthiness, so an empty label
        # list is rejected rather than silently ignored.
        expected_label_count = len(full_bins) - 1
        if labels is not None and len(labels) != expected_label_count:
            raise ValueError(f'标签数量({len(labels)})必须等于区间数量({expected_label_count})')

        result[new_column_name] = pd.cut(
            result[column],
            bins=full_bins,
            labels=labels,
            right=False,
            include_lowest=True
        )

    elif method == 'equal_width':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        # Explicit label-count validation (previously pandas raised a less
        # helpful error on mismatch).
        if labels is not None and len(labels) != num_bins:
            raise ValueError(f'标签数量({len(labels)})必须等于区间数量({num_bins})')

        result[new_column_name] = pd.cut(
            result[column],
            bins=num_bins,
            labels=labels,
            include_lowest=True
        )

    elif method == 'equal_freq':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        # No pre-validation of labels here: duplicates='drop' may reduce the
        # number of intervals, so the required label count is data-dependent.
        result[new_column_name] = pd.qcut(
            result[column],
            q=num_bins,
            labels=labels,
            duplicates='drop'  # tolerate repeated quantile boundaries
        )

    else:
        raise ValueError(f"不支持的分箱方法: {method}")

    # Report the category distribution.
    print('分箱结果分布:')
    value_counts = result[new_column_name].value_counts().sort_index()
    for category, count in value_counts.items():
        percentage = count / len(result) * 100
        print(f' {category}: {count} 行 ({percentage:.1f}%)')

    # Values that fell outside all bins (or were NaN) stay NaN.
    missing_count = result[new_column_name].isna().sum()
    if missing_count > 0:
        print(f'警告: {missing_count} 个值无法分箱(可能是缺失值或边界问题)')

    return result
|
||||
|
||||
227
extraction_service/operations/compute.py
Normal file
227
extraction_service/operations/compute.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""
|
||||
计算列 - 预写函数
|
||||
基于公式计算新列,支持数学运算和常用函数
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import re
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
# 允许的函数(安全白名单)
|
||||
ALLOWED_FUNCTIONS = {
|
||||
'abs': abs,
|
||||
'round': round,
|
||||
'sqrt': np.sqrt,
|
||||
'log': np.log,
|
||||
'log10': np.log10,
|
||||
'exp': np.exp,
|
||||
'sin': np.sin,
|
||||
'cos': np.cos,
|
||||
'tan': np.tan,
|
||||
'floor': np.floor,
|
||||
'ceil': np.ceil,
|
||||
'min': min,
|
||||
'max': max,
|
||||
}
|
||||
|
||||
|
||||
def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
    """
    Validate a formula for safety and basic correctness.

    Args:
        formula: formula string
        available_columns: available column names (currently unused;
            kept for interface compatibility — NameError at evaluation
            time reports missing columns instead)

    Returns:
        (is_valid, error_message) — error_message is '' when valid.
    """
    # Reject empty / whitespace-only formulas.
    if not formula or not formula.strip():
        return False, '公式不能为空'

    # Dangerous constructs. BUGFIX: the old patterns required trailing
    # whitespace (e.g. r'eval\s'), so `eval(x)` slipped through; they now
    # match the call form directly.
    dangerous_patterns = [
        r'__',               # dunder access (covers __builtins__ too)
        r'\bimport\b',       # import statements / __import__ attempts
        r'\bexec\s*\(',      # exec(...)
        r'\beval\s*\(',      # eval(...)
        r'\bopen\s*\(',      # file access
        r'\bcompile\s*\(',   # compile(...)
        r'\bglobals\s*\(',   # globals(...)
        r'\blocals\s*\(',    # locals(...)
    ]

    for pattern in dangerous_patterns:
        if re.search(pattern, formula, re.IGNORECASE):
            return False, f'公式包含不允许的操作: {pattern}'

    # Character whitelist: identifiers (incl. CJK), digits, whitespace,
    # arithmetic operators, parentheses, dot and comma.
    # BUGFIX: '%' added — compute_column documents modulo support but the
    # old whitelist rejected it. (The redundant `\*\*` entry is gone; `**`
    # is already covered by the single `*`.)
    allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/%\(\)\.,]'
    if not re.match(f'^{allowed_chars}+$', formula):
        return False, '公式包含不允许的字符'

    return True, ''
|
||||
|
||||
|
||||
def compute_column(
    df: pd.DataFrame,
    new_column_name: str,
    formula: str
) -> pd.DataFrame:
    """
    Compute a new column from a formula.

    Args:
        df: input DataFrame
        new_column_name: name of the column to create
        formula: expression referencing column names, arithmetic operators
            (+, -, *, /, **, %) and whitelisted functions (abs, round,
            sqrt, log, exp, ...)

    Returns:
        Copy of `df` with the new column appended.

    Raises:
        ValueError: if the formula fails validation or evaluation.

    Examples:
        # BMI
        compute_column(df, 'BMI', '体重 / (身高/100)**2')

        # square root
        compute_column(df, '年龄_sqrt', 'sqrt(年龄)')

        # weighted score
        compute_column(df, '综合得分', '(FMA*0.6 + ADL*0.4) / 100')
    """
    result = df.copy()

    print(f'计算新列: {new_column_name}')
    print(f'公式: {formula}')
    print('')

    # Reject unsafe or malformed formulas before evaluating anything.
    is_valid, error_msg = validate_formula(formula, list(result.columns))
    if not is_valid:
        raise ValueError(f'公式验证失败: {error_msg}')

    # Build the evaluation environment:
    # 1. each column as a variable, auto-coerced to numeric when possible.
    env = {}
    for col in result.columns:
        try:
            numeric_col = pd.to_numeric(result[col], errors='coerce')
            # Treat as numeric only if at least one value converted.
            if not numeric_col.isna().all():
                env[col] = numeric_col
                print(f' 列 "{col}" 自动转换为数值类型')
            else:
                env[col] = result[col]
        except Exception:
            env[col] = result[col]

    # 2. whitelisted functions, 3. numpy for math.
    env.update(ALLOWED_FUNCTIONS)
    env['np'] = np

    try:
        # SECURITY NOTE: eval() on user input — guarded by validate_formula's
        # blacklist/whitelist plus an empty __builtins__ and a controlled env.
        result[new_column_name] = eval(formula, {"__builtins__": {}}, env)

        print(f'计算成功!')
        print(f'新列类型: {result[new_column_name].dtype}')
        print(f'新列前5个值:')
        for idx, val in result[new_column_name].head().items():
            if pd.isna(val):
                print(f' [{idx}] None (NaN)')
            # BUGFIX: np.isinf raises TypeError on non-numeric values, so it
            # is only called once val is known to be a number.
            elif isinstance(val, (int, float, np.number)) and np.isinf(val):
                print(f' [{idx}] None (inf)')
            else:
                print(f' [{idx}] {val}')
        print('')

        # Summary statistics over valid (finite, non-NaN) values only.
        if pd.api.types.is_numeric_dtype(result[new_column_name]):
            col_data = result[new_column_name]

            nan_count = col_data.isna().sum()
            inf_count = np.isinf(col_data.replace([np.nan], 0)).sum()

            print(f'统计信息:')

            valid_data = col_data.dropna().replace([np.inf, -np.inf], np.nan).dropna()

            if len(valid_data) > 0:
                print(f' 最小值: {valid_data.min():.2f}')
                print(f' 最大值: {valid_data.max():.2f}')
                print(f' 平均值: {valid_data.mean():.2f}')
            else:
                print(f' 没有有效的数值')

            if nan_count > 0:
                print(f' 缺失值(NaN): {nan_count} 个')
            if inf_count > 0:
                print(f' 无穷大值(inf): {inf_count} 个')
        else:
            print(f'非数值类型,跳过统计')

        return result

    except NameError as e:
        # A column referenced in the formula does not exist. BUGFIX: guard
        # the quote-splitting — some NameError messages carry no quotes,
        # which used to raise an IndexError here.
        parts = str(e).split("'")
        missing_col = parts[1] if len(parts) > 1 else str(e)
        raise ValueError(f'列 "{missing_col}" 不存在,请检查公式中的列名') from e

    except ZeroDivisionError as e:
        raise ValueError('除零错误:公式中存在除以0的情况') from e

    except Exception as e:
        raise ValueError(f'计算失败: {str(e)}') from e
|
||||
|
||||
|
||||
def get_formula_examples() -> list[Dict[str, str]]:
    """
    Return ready-made formula examples for the UI.

    Returns:
        List of dicts, each with 'name', 'formula' and 'description'.
    """
    examples = (
        ('BMI计算', '体重 / (身高/100)**2',
         '体重指数(需要身高(cm)和体重(kg)列)'),
        ('年龄分组', 'round(年龄 / 10) * 10',
         '按10岁为一组(20, 30, 40...)'),
        ('综合得分', '(FMA得分 * 0.6 + ADL得分 * 0.4)',
         '加权平均分'),
        ('变化率', '(随访值 - 基线值) / 基线值 * 100',
         '计算变化百分比'),
        ('对数转换', 'log(值 + 1)',
         '对数变换(处理偏态分布)'),
    )
    return [
        {'name': name, 'formula': formula, 'description': description}
        for name, formula, description in examples
    ]
|
||||
|
||||
188
extraction_service/operations/conditional.py
Normal file
188
extraction_service/operations/conditional.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
条件生成列 - 预写函数
|
||||
支持复杂的IF-THEN-ELSE多条件逻辑
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Union
|
||||
|
||||
|
||||
def apply_conditional_column(
    df: pd.DataFrame,
    new_column_name: str,
    rules: List[Dict[str, Any]],
    else_value: Any = None
) -> pd.DataFrame:
    """
    Generate a new column from multi-condition rules.

    Args:
        df: input DataFrame
        new_column_name: name of the column to create
        rules: list of rules; each rule contains:
            - conditions: list of condition dicts (column/operator/value)
            - logic: 'and' or 'or'
            - result: value assigned to rows matching the rule
        else_value: default for rows no rule matches

    Returns:
        Copy of `df` with the new column appended.

    NOTE(review): rules are applied in order with .loc assignment, so a
    LATER rule overwrites an earlier one on overlapping rows (last match
    wins) — confirm this is the intended precedence.

    Example:
        rules = [
            {
                "conditions": [
                    {"column": "年龄", "operator": ">=", "value": 60}
                ],
                "logic": "and",
                "result": "老年"
            },
            {
                "conditions": [
                    {"column": "年龄", "operator": ">=", "value": 18},
                    {"column": "年龄", "operator": "<", "value": 60}
                ],
                "logic": "and",
                "result": "成年"
            }
        ]
    """
    result = df.copy()

    # Validate rules: must be non-empty.
    if not rules or len(rules) == 0:
        raise ValueError('至少需要1条规则')

    # Validate every referenced column before touching the data.
    for rule in rules:
        for condition in rule.get('conditions', []):
            column = condition.get('column')
            if column not in result.columns:
                raise ValueError(f'列 "{column}" 不存在')

    # Initialize the whole column to the default value.
    result[new_column_name] = else_value

    print(f'开始应用条件规则,共 {len(rules)} 条规则')

    # Apply each rule in order (1-based index for log messages).
    for rule_idx, rule in enumerate(rules, 1):
        conditions = rule.get('conditions', [])
        logic = rule.get('logic', 'and')
        result_value = rule.get('result')

        # Rules without conditions are skipped silently.
        if not conditions:
            continue

        # Build one boolean mask per condition.
        masks = []
        for condition in conditions:
            column = condition['column']
            operator = condition['operator']
            value = condition['value']

            # Smart type coercion: for ordering comparisons, try to compare
            # numerically (column coerced with to_numeric, value to float).
            if operator in ('>', '<', '>=', '<='):
                try:
                    col_data = pd.to_numeric(result[column], errors='coerce')
                    # Ensure the comparison value is numeric as well.
                    if not isinstance(value, (int, float)):
                        value = float(value)
                except Exception:
                    # Fall back to raw comparison if coercion fails.
                    col_data = result[column]
            else:
                # Equality comparisons use the raw data.
                col_data = result[column]

            # Translate operator into a boolean mask.
            if operator == '=':
                mask = col_data == value
            elif operator == '!=':
                mask = col_data != value
            elif operator == '>':
                mask = col_data > value
            elif operator == '<':
                mask = col_data < value
            elif operator == '>=':
                mask = col_data >= value
            elif operator == '<=':
                mask = col_data <= value
            else:
                raise ValueError(f'不支持的运算符: {operator}')

            masks.append(mask)

        # Combine the per-condition masks ('and' = all, 'or' = any).
        if logic == 'and':
            final_mask = pd.concat(masks, axis=1).all(axis=1)
        elif logic == 'or':
            final_mask = pd.concat(masks, axis=1).any(axis=1)
        else:
            raise ValueError(f'不支持的逻辑运算符: {logic}')

        # Assign the rule's result to the matching rows.
        matched_count = final_mask.sum()
        result.loc[final_mask, new_column_name] = result_value

        print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')

    # Report the final value distribution (including unmatched/NaN rows).
    print(f'\n结果分布:')
    value_counts = result[new_column_name].value_counts(dropna=False)
    for value, count in value_counts.items():
        percentage = count / len(result) * 100
        if pd.isna(value):
            print(f' (空值): {count} 行 ({percentage:.1f}%)')
        else:
            print(f' {value}: {count} 行 ({percentage:.1f}%)')

    return result
|
||||
|
||||
|
||||
def apply_simple_binning(
    df: pd.DataFrame,
    column: str,
    new_column_name: str,
    threshold: float,
    value_if_true: Any = 1,
    value_if_false: Any = 0
) -> pd.DataFrame:
    """Simple binary classification by a single threshold.

    Simplified version of the conditional-column feature: rows where
    ``column >= threshold`` receive ``value_if_true``, rows below the
    threshold receive ``value_if_false``, and missing values stay missing.

    Args:
        df: Input data frame.
        column: Column used for the comparison.
        new_column_name: Name of the column to create.
        threshold: Threshold value.
        value_if_true: Value assigned when column >= threshold.
        value_if_false: Value assigned when column < threshold.

    Returns:
        A copy of ``df`` with the new column added.

    Example:
        Inpatient exposure grouping:
        needling duration >= 10 -> 1 (exposed)
        needling duration < 10  -> 0 (not exposed)
    """
    result = df.copy()

    if column not in result.columns:
        raise ValueError(f'列 "{column}" 不存在')

    # Build both masks once.  Note: comparisons against NaN are False for
    # both ">=" and "<", so NaN must be handled explicitly below instead of
    # silently falling into either branch.
    above = result[column] >= threshold
    below = result[column] < threshold

    # map() assigns the branch values directly; the previous arithmetic
    # formula (mask.astype(int) * value) broke for non-numeric values and
    # produced 0/'' for NaN rows (neither branch value).
    result[new_column_name] = above.map({True: value_if_true, False: value_if_false})
    # Preserve missingness: NaN input stays NaN in the output column.
    result.loc[result[column].isna(), new_column_name] = float('nan')

    # Distribution summary (console output kept from the original).
    print(f'简单二分类结果:')
    print(f' {column} >= {threshold}: {above.sum()} 行 → {value_if_true}')
    print(f' {column} < {threshold}: {below.sum()} 行 → {value_if_false}')

    return result
|
||||
|
||||
149
extraction_service/operations/dropna.py
Normal file
149
extraction_service/operations/dropna.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""
|
||||
删除缺失值 - 预写函数
|
||||
支持按行删除、按列删除、阈值控制
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Literal, Optional, List
|
||||
|
||||
|
||||
def drop_missing_values(
    df: pd.DataFrame,
    method: Literal['row', 'column', 'both'] = 'row',
    threshold: Optional[float] = None,
    subset: Optional[List[str]] = None
) -> pd.DataFrame:
    """Drop missing values by row, by column, or both.

    Args:
        df: Input data frame.
        method: Drop strategy:
            - 'row': drop rows that contain missing values.
            - 'column': drop columns whose missing rate exceeds the threshold.
            - 'both': drop columns first, then rows.
        threshold: Missing-rate threshold (between 0 and 1); only used for
            'column' and 'both'. A column is dropped when its missing rate
            exceeds this value. Defaults to 0.5 (50%).
        subset: Only check the listed columns when dropping rows.

    Returns:
        The data frame with missing values removed.

    Examples:
        # Drop every row containing a missing value
        drop_missing_values(df, method='row')

        # Drop columns with >30% missing
        drop_missing_values(df, method='column', threshold=0.3)

        # Drop sparse columns first, then incomplete rows
        drop_missing_values(df, method='both', threshold=0.5)

        # Only check the given columns
        drop_missing_values(df, method='row', subset=['年龄', 'BMI'])
    """
    result = df.copy()
    original_shape = result.shape

    print(f'原始数据: {original_shape[0]} 行 × {original_shape[1]} 列')
    print(f'缺失值总数: {result.isna().sum().sum()}')
    print('')

    # Default threshold: 50% missing.
    if threshold is None:
        threshold = 0.5

    # --- Column phase ---
    if method in ('column', 'both'):
        # Per-column missing rate; guard the division when the frame has
        # no rows (there is nothing to measure, so drop nothing).
        if len(result) > 0:
            missing_rate = result.isna().sum() / len(result)
            cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()
        else:
            cols_to_drop = []

        if cols_to_drop:
            print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}个')
            for col in cols_to_drop:
                rate = missing_rate[col]
                count = result[col].isna().sum()
                print(f' - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')

            result = result.drop(columns=cols_to_drop)
            print(f'删除后: {result.shape[0]} 行 × {result.shape[1]} 列')
            print('')
        else:
            print(f'没有找到缺失率>{threshold*100:.0f}%的列')
            print('')

    # --- Row phase ---
    if method in ('row', 'both'):
        before_rows = len(result)

        if subset:
            # Keep only subset columns that still exist: with method='both'
            # the column phase may already have dropped some of them, and
            # dropna(subset=...) would then raise KeyError.
            existing = [col for col in subset if col in result.columns]
            print(f'仅检查指定列的缺失值: {existing}')
            if existing:
                result = result.dropna(subset=existing)
        else:
            # Check every column.
            result = result.dropna()

        dropped_rows = before_rows - len(result)
        if dropped_rows > 0:
            print(f'删除了 {dropped_rows} 行(包含缺失值的行)')
            print(f'保留了 {len(result)} 行({len(result)/before_rows*100:.1f}%)')
        else:
            print('没有找到包含缺失值的行')
        print('')

    # --- Final summary ---
    final_shape = result.shape
    print(f'最终结果: {final_shape[0]} 行 × {final_shape[1]} 列')
    print(f'删除了 {original_shape[0] - final_shape[0]} 行')
    print(f'删除了 {original_shape[1] - final_shape[1]} 列')
    print(f'剩余缺失值: {result.isna().sum().sum()}')

    # Warn when everything was removed.
    if len(result) == 0:
        print('\n⚠️ 警告: 删除后数据为空!')

    return result
|
||||
|
||||
|
||||
def get_missing_summary(df: pd.DataFrame) -> dict:
    """Summarize missing values in a data frame.

    Args:
        df: Input data frame.

    Returns:
        A dict with overall counts, the number of affected rows, and a
        per-column breakdown (count and rate) for columns that have
        at least one missing value.
    """
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    na_mask = df.isna()
    per_column = na_mask.sum()
    total_missing = per_column.sum()

    # Rows that contain at least one missing value.
    affected_rows = (na_mask.sum(axis=1) > 0).sum()

    # Per-column detail, restricted to columns with missing values.
    detail = {
        col: {
            'count': int(cnt),
            'rate': float(cnt / n_rows),
        }
        for col, cnt in per_column.items()
        if cnt > 0
    }

    return {
        'total_cells': total_cells,
        'total_missing': int(total_missing),
        'missing_rate': total_missing / total_cells if total_cells > 0 else 0,
        'rows_with_missing': int(affected_rows),
        'cols_with_missing': len(detail),
        'col_missing_detail': detail,
    }
|
||||
|
||||
109
extraction_service/operations/filter.py
Normal file
109
extraction_service/operations/filter.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
高级筛选操作
|
||||
|
||||
提供多条件筛选功能,支持AND/OR逻辑组合。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Literal
|
||||
|
||||
|
||||
def apply_filter(
    df: pd.DataFrame,
    conditions: List[Dict[str, Any]],
    logic: Literal['and', 'or'] = 'and'
) -> pd.DataFrame:
    """Filter a data frame by a list of conditions combined with AND/OR.

    Args:
        df: Input data frame.
        conditions: List of filter conditions; each condition contains:
            - column: column name
            - operator: one of =, !=, >, <, >=, <=, contains, not_contains,
              starts_with, ends_with, is_null, not_null
            - value: comparison value (unused for is_null / not_null)
        logic: How to combine the conditions ('and' or 'or').

    Returns:
        The filtered data frame (a copy).

    Examples:
        >>> df = pd.DataFrame({'年龄': [25, 35, 45], '性别': ['男', '女', '男']})
        >>> conditions = [
        ...     {'column': '年龄', 'operator': '>', 'value': 30},
        ...     {'column': '性别', 'operator': '=', 'value': '男'}
        ... ]
        >>> result = apply_filter(df, conditions, logic='and')
        >>> len(result)
        1
    """
    if not conditions:
        raise ValueError('筛选条件不能为空')

    if df.empty:
        return df

    # Operator dispatch table: each entry maps an operator name to a
    # callable (series, value) -> boolean mask.  String operators coerce
    # the column to str and treat NaN as non-matching (na=False).
    comparators = {
        '=': lambda s, v: s == v,
        '!=': lambda s, v: s != v,
        '>': lambda s, v: s > v,
        '<': lambda s, v: s < v,
        '>=': lambda s, v: s >= v,
        '<=': lambda s, v: s <= v,
        'contains': lambda s, v: s.astype(str).str.contains(str(v), na=False),
        'not_contains': lambda s, v: ~s.astype(str).str.contains(str(v), na=False),
        'starts_with': lambda s, v: s.astype(str).str.startswith(str(v), na=False),
        'ends_with': lambda s, v: s.astype(str).str.endswith(str(v), na=False),
        'is_null': lambda s, v: s.isna(),
        'not_null': lambda s, v: s.notna(),
    }

    masks = []
    for cond in conditions:
        column = cond['column']
        operator = cond['operator']
        value = cond.get('value')

        # The column must exist before any comparison.
        if column not in df.columns:
            raise KeyError(f"列 '{column}' 不存在")

        if operator not in comparators:
            raise ValueError(f"不支持的运算符: {operator}")

        masks.append(comparators[operator](df[column], value))

    # Combine the per-condition masks.
    combined = pd.concat(masks, axis=1)
    if logic == 'and':
        final_mask = combined.all(axis=1)
    elif logic == 'or':
        final_mask = combined.any(axis=1)
    else:
        raise ValueError(f"不支持的逻辑运算: {logic}")

    result = df[final_mask].copy()

    # Console summary of the filtering outcome.
    original_rows = len(df)
    filtered_rows = len(result)
    removed_rows = original_rows - filtered_rows

    print(f'原始数据: {original_rows} 行')
    print(f'筛选后: {filtered_rows} 行')
    print(f'删除: {removed_rows} 行 ({removed_rows/original_rows*100:.1f}%)')

    return result
|
||||
|
||||
161
extraction_service/operations/pivot.py
Normal file
161
extraction_service/operations/pivot.py
Normal file
@@ -0,0 +1,161 @@
|
||||
"""
|
||||
Pivot操作 - 预写函数
|
||||
长表转宽表(一人多行 → 一人一行)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Literal, Optional
|
||||
|
||||
|
||||
def pivot_long_to_wide(
    df: pd.DataFrame,
    index_column: str,
    pivot_column: str,
    value_columns: List[str],
    aggfunc: Literal['first', 'last', 'mean', 'sum', 'min', 'max'] = 'first'
) -> pd.DataFrame:
    """Pivot a long table into a wide table (many rows per subject -> one row).

    Args:
        df: Input data frame.
        index_column: Unique identifier column (e.g. Record ID).
        pivot_column: Column whose values become the new column names
            (e.g. Event Name).
        value_columns: Data columns to spread out (e.g. FMA score, ADL score).
        aggfunc: Aggregation applied when an (index, pivot) pair repeats:
            'first' (recommended), 'last', 'mean', 'sum', 'min' or 'max'.

    Returns:
        The widened data frame; new columns are named '<value>_<pivot value>'.

    Example:
        pivot_long_to_wide(
            df,
            index_column='Record ID',
            pivot_column='Event Name',
            value_columns=['FMA得分', 'ADL得分'],
            aggfunc='first'
        )
    """
    result = df.copy()

    print(f'原始数据: {len(result)} 行 × {len(result.columns)} 列')
    print(f'索引列: {index_column}')
    print(f'透视列: {pivot_column}')
    print(f'值列: {", ".join(value_columns)}')
    print(f'聚合方式: {aggfunc}')
    print('')

    # Validate that every referenced column exists.
    required_cols = [index_column, pivot_column] + value_columns
    missing_cols = [col for col in required_cols if col not in result.columns]
    if missing_cols:
        raise ValueError(f'以下列不存在: {", ".join(missing_cols)}')

    # Cardinality of the future row index.
    unique_index = result[index_column].nunique()
    print(f'唯一{index_column}数量: {unique_index}')

    # Distinct pivot values (the future column suffixes).
    unique_pivot = result[pivot_column].unique()
    print(f'透视列"{pivot_column}"的唯一值: {list(unique_pivot)}')
    print('')

    try:
        # Perform the pivot; aggfunc resolves duplicate (index, pivot) pairs.
        df_pivot = result.pivot_table(
            index=index_column,
            columns=pivot_column,
            values=value_columns,
            aggfunc=aggfunc
        )

        # pivot_table returns a (value, pivot) MultiIndex whenever ``values``
        # is passed as a list -- even a single-element list -- so flatten
        # whenever a MultiIndex is present.  The previous single-column
        # branch formatted the tuple itself into the name (producing e.g.
        # "v_('v', 'a')" instead of "v_a").
        if isinstance(df_pivot.columns, pd.MultiIndex):
            df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
        else:
            df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]

        # Turn the index back into a regular column.
        df_pivot = df_pivot.reset_index()

        print(f'转换成功!')
        print(f'结果: {len(df_pivot)} 行 × {len(df_pivot.columns)} 列')
        print(f'新增列: {len(df_pivot.columns) - 1} 列')
        print('')

        # Show the generated column names (first 10 at most).
        print(f'生成的列名:')
        new_cols = [col for col in df_pivot.columns if col != index_column]
        for i, col in enumerate(new_cols[:10], 1):
            print(f' {i}. {col}')
        if len(new_cols) > 10:
            print(f' ... 还有 {len(new_cols) - 10} 列')

        return df_pivot

    except ValueError as e:
        # NOTE(review): pivot_table aggregates duplicates itself, so this
        # branch is mainly defensive (DataFrame.pivot raises this message).
        # Kept to surface duplicate diagnostics if it ever fires.
        if 'Index contains duplicate entries' in str(e):
            # Report the duplicated (index, pivot) combinations.
            duplicates = result.groupby([index_column, pivot_column]).size()
            duplicates = duplicates[duplicates > 1]

            print('⚠️ 警告: 发现重复的索引+透视组合:')
            for (idx, piv), count in duplicates.head(5).items():
                print(f' {index_column}={idx}, {pivot_column}={piv}: {count}次')

            if len(duplicates) > 5:
                print(f' ... 还有 {len(duplicates) - 5} 个重复组合')

            print(f'\n建议: 使用聚合函数(如mean、sum)处理重复值')
            print(f'当前聚合方式: {aggfunc}')

            raise ValueError(f'存在重复的{index_column}+{pivot_column}组合,需要选择合适的聚合方式')
        else:
            raise e
|
||||
|
||||
|
||||
def get_pivot_preview(
    df: pd.DataFrame,
    index_column: str,
    pivot_column: str
) -> dict:
    """Collect preview statistics for a prospective pivot operation.

    Args:
        df: Input data frame.
        index_column: Column that will become the row index.
        pivot_column: Column whose values will become new column names.

    Returns:
        A dict describing index cardinality, distinct pivot values, and
        whether duplicate (index, pivot) combinations exist.
    """
    index_cardinality = df[index_column].nunique()
    pivot_values = df[pivot_column].unique()

    # Count rows per (index, pivot) pair; any count > 1 means the pivot
    # will need an aggregation function.
    pair_sizes = df.groupby([index_column, pivot_column]).size()
    dup_pairs = pair_sizes[pair_sizes > 1]
    has_duplicates = not dup_pairs.empty

    return {
        'unique_index_count': int(index_cardinality),
        'unique_pivot_values': [str(v) for v in pivot_values],
        'has_duplicates': bool(has_duplicates),
        'duplicate_count': len(dup_pairs) if has_duplicates else 0,
        'estimated_rows': int(index_cardinality),
        'estimated_columns': len(pivot_values),
    }
|
||||
|
||||
79
extraction_service/operations/recode.py
Normal file
79
extraction_service/operations/recode.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
数值映射(重编码)操作
|
||||
|
||||
将分类变量的原始值映射为新值(如:男→1,女→2)。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
def apply_recode(
    df: pd.DataFrame,
    column: str,
    mapping: Dict[Any, Any],
    create_new_column: bool = True,
    new_column_name: Optional[str] = None
) -> pd.DataFrame:
    """Recode (re-map) the values of a categorical column.

    Args:
        df: Input data frame.
        column: Column to recode.
        mapping: Value mapping, e.g. {'男': 1, '女': 2}.
        create_new_column: If True, write the result to a new column;
            otherwise overwrite ``column`` in place.
        new_column_name: Name of the new column (used when
            ``create_new_column`` is True; defaults to '<column>_编码').

    Returns:
        The recoded data frame (a copy of ``df``); values with no entry
        in ``mapping`` become NaN.

    Examples:
        >>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
        >>> mapping = {'男': 1, '女': 2}
        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
        >>> result['性别_编码'].tolist()
        [1, 2, 1, 2]
    """
    if df.empty:
        return df

    # Validate inputs before doing any work.
    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    if not mapping:
        raise ValueError('映射字典不能为空')

    # Resolve the target column name.
    if create_new_column:
        target_column = new_column_name or f'{column}_编码'
    else:
        target_column = column

    # Work on a copy so the caller's frame is untouched.
    result = df.copy()

    # Apply the mapping; unmapped values become NaN.
    result[target_column] = result[column].map(mapping)

    # Summary statistics.
    mapped_count = result[target_column].notna().sum()
    unmapped_count = result[target_column].isna().sum()
    total_count = len(result)

    print(f'映射完成: {mapped_count} 个值成功映射')

    if unmapped_count > 0:
        print(f'警告: {unmapped_count} 个值未找到对应映射')
        unmapped_mask = result[target_column].isna()
        # Read the original values from ``df``, not ``result``: when
        # create_new_column=False the column has already been overwritten,
        # so ``result`` would only show NaN here.
        unmapped_values = df.loc[unmapped_mask, column].unique()
        print(f'未映射的值: {list(unmapped_values)[:10]}')  # show at most 10

    # Mapping success rate (guard against an all-filtered frame).
    success_rate = (mapped_count / total_count * 100) if total_count > 0 else 0
    print(f'映射成功率: {success_rate:.1f}%')

    return result
|
||||
|
||||
@@ -279,3 +279,5 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -45,3 +45,5 @@ except Exception as e:
|
||||
print(f"\n❌ 测试异常: {str(e)}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -25,3 +25,5 @@ except Exception as e:
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user