feat(dc/tool-c): Add pivot column ordering and NA handling features

Major features:

1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)
2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fix nan display)
   - Conditional: Add is_null/not_null operators
3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
- Python: pivot.py, recode.py, binning.py, conditional.py, main.py
- Backend: SessionController, QuickActionController, QuickActionService
- Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
2025-12-09 14:40:14 +08:00
parent 75ceeb0653
commit f4f1d09837
19 changed files with 2314 additions and 123 deletions
--- a/extraction_service/operations/binning.py
+++ b/extraction_service/operations/binning.py
@@ -17,7 +17,10 @@ def apply_binning(
    new_column_name: str,
    bins: Optional[List[Union[int, float]]] = None,
    labels: Optional[List[Union[str, int]]] = None,
-    num_bins: int = 3
+    num_bins: int = 3,
+    na_handling: Literal['keep', 'label', 'assign'] = 'keep',
+    na_label: Optional[str] = None,
+    na_assign_to: Optional[int] = None
 ) -> pd.DataFrame:
    """
    应用分箱操作
@@ -33,16 +36,23 @@ def apply_binning(
        bins: 自定义切点列表（仅method='custom'时使用），如 [18, 60] → <18, 18-60, >60
        labels: 标签列表（可选）
        num_bins: 分组数量（仅method='equal_width'或'equal_freq'时使用）
+        na_handling: NA值处理方式
+            - 'keep': 保持为NA（默认）
+            - 'label': 标记为指定标签
+            - 'assign': 分配到指定组
+        na_label: 当na_handling='label'时，NA的标签（如"缺失"）
+        na_assign_to: 当na_handling='assign'时，NA分配到的组索引
    
    Returns:
        分箱后的数据框
    
    Examples:
-        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
+        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75, None]})
        >>> result = apply_binning(df, '年龄', 'custom', '年龄分组', 
-        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'])
+        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'],
+        ...                        na_handling='label', na_label='缺失')
        >>> result['年龄分组'].tolist()
-        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
+        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年', '缺失']
    """
    if df.empty:
        return df
@@ -54,6 +64,10 @@ def apply_binning(
    # 创建结果数据框
    result = df.copy()
    
+    # ✨ 记录原始NA的位置（在分箱前）
+    original_na_mask = result[column].isna()
+    original_na_count = original_na_mask.sum()
+    
    # 验证并转换数据类型
    if not pd.api.types.is_numeric_dtype(result[column]):
        # 尝试将字符串转换为数值
@@ -136,6 +150,9 @@ def apply_binning(
    else:
        raise ValueError(f"不支持的分箱方法: {method}")
    
+    # ✨ 重要：将Categorical类型转换为object类型，避免"nan"字符串问题
+    result[new_column_name] = result[new_column_name].astype('object')
+    
    # ✨ 优化：将新列移到原列旁边
    original_col_index = result.columns.get_loc(column)
    cols = list(result.columns)
@@ -145,6 +162,27 @@ def apply_binning(
    cols.insert(original_col_index + 1, new_column_name)
    result = result[cols]
    
+    # ✨ 处理NA值（使用分箱前记录的NA位置）
+    if original_na_count > 0:
+        if na_handling == 'keep':
+            # 保持为NA（显式设置为None，避免显示为"nan"字符串）
+            result.loc[original_na_mask, new_column_name] = None
+            print(f'📊 NA处理：保持为NA（{original_na_count}个）', flush=True)
+        
+        elif na_handling == 'label':
+            # 标记为指定标签
+            label_to_use = na_label if na_label else '空值/NA'
+            result.loc[original_na_mask, new_column_name] = label_to_use
+            print(f'📊 NA处理：标记为 "{label_to_use}"（{original_na_count}个）', flush=True)
+        
+        elif na_handling == 'assign':
+            # 分配到指定组（通过labels）
+            if labels and na_assign_to is not None and 0 <= na_assign_to < len(labels):
+                result.loc[original_na_mask, new_column_name] = labels[na_assign_to]
+                print(f'📊 NA处理：分配到组 "{labels[na_assign_to]}"（{original_na_count}个）', flush=True)
+            else:
+                print(f'⚠️  警告：na_assign_to无效，NA保持为空', flush=True)
+    
    # 统计分布
    print(f'分箱结果分布:')
    value_counts = result[new_column_name].value_counts().sort_index()