feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions

Summary: - Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot) - Refactor to pre-written Python functions architecture (stable and secure) - Add 7 Python operations modules with full type hints - Add 7 frontend Dialog components with user-friendly UI - Fix NaN serialization issues and auto type conversion - Update all related documentation Technical Details: - Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py) - Backend: QuickActionService.ts with 7 execute methods - Frontend: 7 Dialog components with complete validation - Toolbar: Enable 7 quick action buttons Status: Phase 1-2 completed, basic testing passed, ready for further testing
2025-12-08 17:38:08 +08:00
parent af325348b8
commit f729699510
158 changed files with 13814 additions and 273 deletions
--- a/python-microservice/operations/init.py
+++ b/python-microservice/operations/init.py
@@ -0,0 +1,16 @@
+"""
+数据操作函数模块
+
+提供预写的、经过测试的数据处理函数，供功能按钮调用。
+
+模块列表：
+- filter: 高级筛选
+- recode: 数值映射（重编码）
+- binning: 生成分类变量（分箱）
+- conditional: 条件生成列
+- missing: 缺失值处理
+- duplicate: 去重
+"""
+
+__version__ = '1.0.0'
+
--- a/python-microservice/operations/binning.py
+++ b/python-microservice/operations/binning.py
@@ -0,0 +1,123 @@
+"""
+生成分类变量（分箱）操作
+
+将连续数值变量转换为分类变量。
+支持三种方法：自定义切点、等宽分箱、等频分箱。
+"""
+
+import pandas as pd
+import numpy as np
+from typing import List, Optional, Literal, Union
+
+
+def apply_binning(
+    df: pd.DataFrame,
+    column: str,
+    method: Literal['custom', 'equal_width', 'equal_freq'],
+    new_column_name: str,
+    bins: Optional[List[Union[int, float]]] = None,
+    labels: Optional[List[Union[str, int]]] = None,
+    num_bins: int = 3
+) -> pd.DataFrame:
+    """
+    应用分箱操作
+    
+    Args:
+        df: 输入数据框
+        column: 要分箱的列名
+        method: 分箱方法
+            - 'custom': 自定义切点
+            - 'equal_width': 等宽分箱
+            - 'equal_freq': 等频分箱
+        new_column_name: 新列名
+        bins: 自定义切点列表（仅method='custom'时使用），如 [18, 60] → <18, 18-60, >60
+        labels: 标签列表（可选）
+        num_bins: 分组数量（仅method='equal_width'或'equal_freq'时使用）
+    
+    Returns:
+        分箱后的数据框
+    
+    Examples:
+        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
+        >>> result = apply_binning(df, '年龄', 'custom', '年龄分组', 
+        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'])
+        >>> result['年龄分组'].tolist()
+        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
+    """
+    if df.empty:
+        return df
+    
+    # 验证列是否存在
+    if column not in df.columns:
+        raise KeyError(f"列 '{column}' 不存在")
+    
+    # 验证数据类型
+    if not pd.api.types.is_numeric_dtype(df[column]):
+        raise TypeError(f"列 '{column}' 不是数值类型，无法进行分箱")
+    
+    # 创建结果数据框
+    result = df.copy()
+    
+    # 根据方法进行分箱
+    if method == 'custom':
+        # 自定义切点
+        if not bins or len(bins) < 2:
+            raise ValueError('自定义切点至少需要2个值')
+        
+        # 验证切点是否升序
+        if bins != sorted(bins):
+            raise ValueError('切点必须按升序排列')
+        
+        # 验证标签数量
+        if labels and len(labels) != len(bins) - 1:
+            raise ValueError(f'标签数量（{len(labels)}）必须等于切点数量-1（{len(bins)-1}）')
+        
+        result[new_column_name] = pd.cut(
+            result[column],
+            bins=bins,
+            labels=labels,
+            right=False,
+            include_lowest=True
+        )
+        
+    elif method == 'equal_width':
+        # 等宽分箱
+        if num_bins < 2:
+            raise ValueError('分组数量至少为2')
+        
+        result[new_column_name] = pd.cut(
+            result[column],
+            bins=num_bins,
+            labels=labels,
+            include_lowest=True
+        )
+        
+    elif method == 'equal_freq':
+        # 等频分箱
+        if num_bins < 2:
+            raise ValueError('分组数量至少为2')
+        
+        result[new_column_name] = pd.qcut(
+            result[column],
+            q=num_bins,
+            labels=labels,
+            duplicates='drop'  # 处理重复边界值
+        )
+        
+    else:
+        raise ValueError(f"不支持的分箱方法: {method}")
+    
+    # 统计分布
+    print(f'分箱结果分布:')
+    value_counts = result[new_column_name].value_counts().sort_index()
+    for category, count in value_counts.items():
+        percentage = count / len(result) * 100
+        print(f'  {category}: {count} 行 ({percentage:.1f}%)')
+    
+    # 缺失值统计
+    missing_count = result[new_column_name].isna().sum()
+    if missing_count > 0:
+        print(f'警告: {missing_count} 个值无法分箱（可能是缺失值或边界问题）')
+    
+    return result
+
--- a/python-microservice/operations/filter.py
+++ b/python-microservice/operations/filter.py
@@ -0,0 +1,109 @@
+"""
+高级筛选操作
+
+提供多条件筛选功能，支持AND/OR逻辑组合。
+"""
+
+import pandas as pd
+from typing import List, Dict, Any, Literal
+
+
+def apply_filter(
+    df: pd.DataFrame,
+    conditions: List[Dict[str, Any]],
+    logic: Literal['and', 'or'] = 'and'
+) -> pd.DataFrame:
+    """
+    应用筛选条件
+    
+    Args:
+        df: 输入数据框
+        conditions: 筛选条件列表，每个条件包含：
+            - column: 列名
+            - operator: 运算符 (=, !=, >, <, >=, <=, contains, not_contains, 
+                       starts_with, ends_with, is_null, not_null)
+            - value: 值（is_null和not_null不需要）
+        logic: 逻辑组合方式 ('and' 或 'or')
+    
+    Returns:
+        筛选后的数据框
+    
+    Examples:
+        >>> df = pd.DataFrame({'年龄': [25, 35, 45], '性别': ['男', '女', '男']})
+        >>> conditions = [
+        ...     {'column': '年龄', 'operator': '>', 'value': 30},
+        ...     {'column': '性别', 'operator': '=', 'value': '男'}
+        ... ]
+        >>> result = apply_filter(df, conditions, logic='and')
+        >>> len(result)
+        1
+    """
+    if not conditions:
+        raise ValueError('筛选条件不能为空')
+    
+    if df.empty:
+        return df
+    
+    # 生成各个条件的mask
+    masks = []
+    for cond in conditions:
+        column = cond['column']
+        operator = cond['operator']
+        value = cond.get('value')
+        
+        # 验证列是否存在
+        if column not in df.columns:
+            raise KeyError(f"列 '{column}' 不存在")
+        
+        # 根据运算符生成mask
+        if operator == '=':
+            mask = df[column] == value
+        elif operator == '!=':
+            mask = df[column] != value
+        elif operator == '>':
+            mask = df[column] > value
+        elif operator == '<':
+            mask = df[column] < value
+        elif operator == '>=':
+            mask = df[column] >= value
+        elif operator == '<=':
+            mask = df[column] <= value
+        elif operator == 'contains':
+            mask = df[column].astype(str).str.contains(str(value), na=False)
+        elif operator == 'not_contains':
+            mask = ~df[column].astype(str).str.contains(str(value), na=False)
+        elif operator == 'starts_with':
+            mask = df[column].astype(str).str.startswith(str(value), na=False)
+        elif operator == 'ends_with':
+            mask = df[column].astype(str).str.endswith(str(value), na=False)
+        elif operator == 'is_null':
+            mask = df[column].isna()
+        elif operator == 'not_null':
+            mask = df[column].notna()
+        else:
+            raise ValueError(f"不支持的运算符: {operator}")
+        
+        masks.append(mask)
+    
+    # 组合所有条件
+    if logic == 'and':
+        final_mask = pd.concat(masks, axis=1).all(axis=1)
+    elif logic == 'or':
+        final_mask = pd.concat(masks, axis=1).any(axis=1)
+    else:
+        raise ValueError(f"不支持的逻辑运算: {logic}")
+    
+    # 应用筛选
+    result = df[final_mask].copy()
+    
+    # 打印统计信息
+    original_rows = len(df)
+    filtered_rows = len(result)
+    removed_rows = original_rows - filtered_rows
+    
+    print(f'原始数据: {original_rows} 行')
+    print(f'筛选后: {filtered_rows} 行')
+    print(f'删除: {removed_rows} 行 ({removed_rows/original_rows*100:.1f}%)')
+    
+    return result
+
--- a/python-microservice/operations/recode.py
+++ b/python-microservice/operations/recode.py
@@ -0,0 +1,79 @@
+"""
+数值映射（重编码）操作
+
+将分类变量的原始值映射为新值（如：男→1，女→2）。
+"""
+
+import pandas as pd
+from typing import Dict, Any, Optional
+
+
+def apply_recode(
+    df: pd.DataFrame,
+    column: str,
+    mapping: Dict[Any, Any],
+    create_new_column: bool = True,
+    new_column_name: Optional[str] = None
+) -> pd.DataFrame:
+    """
+    应用数值映射
+    
+    Args:
+        df: 输入数据框
+        column: 要重编码的列名
+        mapping: 映射字典，如 {'男': 1, '女': 2}
+        create_new_column: 是否创建新列（True）或覆盖原列（False）
+        new_column_name: 新列名（create_new_column=True时使用）
+    
+    Returns:
+        重编码后的数据框
+    
+    Examples:
+        >>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
+        >>> mapping = {'男': 1, '女': 2}
+        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
+        >>> result['性别_编码'].tolist()
+        [1, 2, 1, 2]
+    """
+    if df.empty:
+        return df
+    
+    # 验证列是否存在
+    if column not in df.columns:
+        raise KeyError(f"列 '{column}' 不存在")
+    
+    if not mapping:
+        raise ValueError('映射字典不能为空')
+    
+    # 确定目标列名
+    if create_new_column:
+        target_column = new_column_name or f'{column}_编码'
+    else:
+        target_column = column
+    
+    # 创建结果数据框（避免修改原数据）
+    result = df.copy()
+    
+    # 应用映射
+    result[target_column] = result[column].map(mapping)
+    
+    # 统计结果
+    mapped_count = result[target_column].notna().sum()
+    unmapped_count = result[target_column].isna().sum()
+    total_count = len(result)
+    
+    print(f'映射完成: {mapped_count} 个值成功映射')
+    
+    if unmapped_count > 0:
+        print(f'警告: {unmapped_count} 个值未找到对应映射')
+        # 找出未映射的唯一值
+        unmapped_mask = result[target_column].isna()
+        unmapped_values = result.loc[unmapped_mask, column].unique()
+        print(f'未映射的值: {list(unmapped_values)[:10]}')  # 最多显示10个
+    
+    # 映射成功率
+    success_rate = (mapped_count / total_count * 100) if total_count > 0 else 0
+    print(f'映射成功率: {success_rate:.1f}%')
+    
+    return result
+