feat(dc/tool-c): Add pivot column ordering and NA handling features

Major features:

1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)

2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fix NaN display)
   - Conditional: Add is_null/not_null operators

3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
- Python: pivot.py, recode.py, binning.py, conditional.py, main.py
- Backend: SessionController, QuickActionController, QuickActionService
- Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
2025-12-09 14:40:14 +08:00
parent 75ceeb0653
commit f4f1d09837
19 changed files with 2314 additions and 123 deletions
--- a/extraction_service/operations/recode.py
+++ b/extraction_service/operations/recode.py
@@ -5,7 +5,8 @@
 """

 import pandas as pd
-from typing import Dict, Any, Optional
+import numpy as np
+from typing import Dict, Any, Optional, Literal


 def apply_recode(
@@ -13,7 +14,9 @@ def apply_recode(
    column: str,
    mapping: Dict[Any, Any],
    create_new_column: bool = True,
-    new_column_name: Optional[str] = None
+    new_column_name: Optional[str] = None,
+    na_handling: Literal['keep', 'map', 'drop'] = 'keep',
+    na_value: Any = None
 ) -> pd.DataFrame:
    """
    应用数值映射
@@ -24,16 +27,21 @@ def apply_recode(
        mapping: 映射字典，如 {'男': 1, '女': 2}
        create_new_column: 是否创建新列（True）或覆盖原列（False）
        new_column_name: 新列名（create_new_column=True时使用）
+        na_handling: NA值处理方式
+            - 'keep': 保持为NA（默认）
+            - 'map': 映射为指定值
+            - 'drop': 删除包含NA的行
+        na_value: 当na_handling='map'时，NA映射到的值
    
    Returns:
        重编码后的数据框
    
    Examples:
-        >>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
+        >>> df = pd.DataFrame({'性别': ['男', '女', '男', None]})
        >>> mapping = {'男': 1, '女': 2}
-        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
+        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码', na_handling='map', na_value=0)
        >>> result['性别_编码'].tolist()
-        [1, 2, 1, 2]
+        [1, 2, 1, 0]
    """
    if df.empty:
        return df
@@ -54,6 +62,9 @@ def apply_recode(
    # 创建结果数据框（避免修改原数据）
    result = df.copy()
    
+    # ✨ 统计原始NA数量
+    original_na_count = result[column].isna().sum()
+    
    # ✨ 优化：如果是创建新列，插入到原列旁边
    if create_new_column:
        original_col_index = result.columns.get_loc(column)
@@ -62,6 +73,26 @@ def apply_recode(
        # 覆盖原列
        result[target_column] = result[column].map(mapping)
    
+    # ✨ 处理NA值
+    if original_na_count > 0:
+        na_mask = result[column].isna()
+        
+        if na_handling == 'keep':
+            # 保持为NA（已经是NA，无需操作）
+            print(f'📊 NA处理：保持为NA（{original_na_count}个）')
+        
+        elif na_handling == 'map':
+            # 映射为指定值
+            result.loc[na_mask, target_column] = na_value
+            print(f'📊 NA处理：映射为 {na_value}（{original_na_count}个）')
+        
+        elif na_handling == 'drop':
+            # 删除包含NA的行
+            rows_before = len(result)
+            result = result[~na_mask].copy()
+            rows_after = len(result)
+            print(f'📊 NA处理：删除包含NA的行（删除{rows_before - rows_after}行）')
+    
    # 统计结果
    mapped_count = result[target_column].notna().sum()
    unmapped_count = result[target_column].isna().sum()