feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions

Summary: - Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot) - Refactor to pre-written Python functions architecture (stable and secure) - Add 7 Python operations modules with full type hints - Add 7 frontend Dialog components with user-friendly UI - Fix NaN serialization issues and auto type conversion - Update all related documentation Technical Details: - Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py) - Backend: QuickActionService.ts with 7 execute methods - Frontend: 7 Dialog components with complete validation - Toolbar: Enable 7 quick action buttons Status: Phase 1-2 completed, basic testing passed, ready for further testing
2025-12-08 17:38:08 +08:00
parent af325348b8
commit f729699510
158 changed files with 13814 additions and 273 deletions
--- a/extraction_service/operations/dropna.py
+++ b/extraction_service/operations/dropna.py
@@ -0,0 +1,149 @@
+"""
+删除缺失值 - 预写函数
+支持按行删除、按列删除、阈值控制
+"""
+
+import pandas as pd
+from typing import Literal, Optional, List
+
+
+def drop_missing_values(
+    df: pd.DataFrame,
+    method: Literal['row', 'column', 'both'] = 'row',
+    threshold: Optional[float] = None,
+    subset: Optional[List[str]] = None
+) -> pd.DataFrame:
+    """
+    Drop rows and/or columns that contain missing values.
+
+    The input frame is copied first, so ``df`` itself is never modified.
+    A step-by-step report (original size, dropped columns/rows, final size)
+    is printed to stdout while the function runs.
+
+    Args:
+        df: Input data frame.
+        method: What to drop.
+            - 'row': drop rows that contain missing values
+            - 'column': drop columns whose missing rate is too high
+            - 'both': drop columns first, then rows
+        threshold: Missing-rate cutoff in the range 0-1, used by 'column'
+            and 'both'. A column is dropped when its missing rate is
+            strictly greater than this value. Defaults to 0.5 (50%).
+        subset: Only these columns are inspected when dropping rows
+            ('row' and 'both'). With 'both', columns that the column step
+            has already removed are skipped instead of raising. An empty
+            or omitted subset means every column is inspected.
+
+    Returns:
+        A new data frame with the selected rows/columns removed.
+
+    Raises:
+        ValueError: If ``method`` is not 'row', 'column' or 'both', or if
+            ``threshold`` lies outside the range 0-1.
+        KeyError: If ``subset`` names a column that does not exist in
+            ``df`` (raised by pandas).
+
+    Examples:
+        # Drop every row that contains any missing value
+        drop_missing_values(df, method='row')
+
+        # Drop columns whose missing rate is above 30%
+        drop_missing_values(df, method='column', threshold=0.3)
+
+        # Drop sparse columns first, then incomplete rows
+        drop_missing_values(df, method='both', threshold=0.5)
+
+        # Only inspect the listed columns
+        drop_missing_values(df, method='row', subset=['年龄', 'BMI'])
+    """
+    # Fail loudly on a bad mode instead of silently returning the input.
+    if method not in ('row', 'column', 'both'):
+        raise ValueError(
+            f"method must be 'row', 'column' or 'both', got {method!r}"
+        )
+
+    # Default threshold
+    if threshold is None:
+        threshold = 0.5
+    # A rate outside 0-1 would silently drop every column or none at all.
+    if not 0 <= threshold <= 1:
+        raise ValueError(
+            f'threshold must be between 0 and 1, got {threshold!r}'
+        )
+
+    result = df.copy()
+    original_shape = result.shape
+
+    print(f'原始数据: {original_shape[0]} 行 × {original_shape[1]} 列')
+    print(f'缺失值总数: {result.isna().sum().sum()}')
+    print('')
+
+    # Columns removed by the column step; consulted by the row step below.
+    cols_to_drop: list = []
+
+    # Column-wise removal
+    if method in ('column', 'both'):
+        # Missing rate per column; the counts are reused for the report.
+        missing_count = result.isna().sum()
+        missing_rate = missing_count / len(result)
+        cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()
+
+        if cols_to_drop:
+            print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}个')
+            for col in cols_to_drop:
+                rate = missing_rate[col]
+                count = missing_count[col]
+                print(f'  - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')
+
+            result = result.drop(columns=cols_to_drop)
+            print(f'删除后: {result.shape[0]} 行 × {result.shape[1]} 列')
+            print('')
+        else:
+            print(f'没有找到缺失率>{threshold*100:.0f}%的列')
+            print('')
+
+    # Row-wise removal
+    if method in ('row', 'both'):
+        before_rows = len(result)
+
+        if subset:
+            # Skip columns this call has just dropped: they no longer exist
+            # and would make dropna raise KeyError. Names that were never in
+            # the frame are kept so pandas still reports them.
+            if cols_to_drop:
+                subset = [col for col in subset if col not in cols_to_drop]
+            print(f'仅检查指定列的缺失值: {subset}')
+            if subset:
+                result = result.dropna(subset=subset)
+        else:
+            # Inspect every column
+            result = result.dropna()
+
+        dropped_rows = before_rows - len(result)
+        if dropped_rows > 0:
+            print(f'删除了 {dropped_rows} 行（包含缺失值的行）')
+            print(f'保留了 {len(result)} 行（{len(result)/before_rows*100:.1f}%）')
+        else:
+            print('没有找到包含缺失值的行')
+        print('')
+
+    # Final statistics
+    final_shape = result.shape
+    print(f'最终结果: {final_shape[0]} 行 × {final_shape[1]} 列')
+    print(f'删除了 {original_shape[0] - final_shape[0]} 行')
+    print(f'删除了 {original_shape[1] - final_shape[1]} 列')
+    print(f'剩余缺失值: {result.isna().sum().sum()}')
+
+    # Warn when nothing is left
+    if len(result) == 0:
+        print('\n⚠️  警告: 删除后数据为空！')
+
+    return result
+
+
+def get_missing_summary(df: pd.DataFrame) -> dict:
+    """
+    Build a summary of the missing values in a data frame.
+
+    Every numeric value in the result is a built-in ``int`` or ``float``
+    (no numpy scalars).
+
+    Args:
+        df: Input data frame.
+
+    Returns:
+        A dict with the keys:
+            - 'total_cells': number of rows times number of columns
+            - 'total_missing': number of missing cells
+            - 'missing_rate': total_missing / total_cells, or 0.0 when the
+              frame has no cells
+            - 'rows_with_missing': rows containing at least one missing cell
+            - 'cols_with_missing': columns containing at least one missing cell
+            - 'col_missing_detail': per affected column, a dict with its
+              missing 'count' and missing 'rate' (count / number of rows)
+    """
+    n_rows, n_cols = df.shape
+    total_cells = n_rows * n_cols
+
+    # Compute the NaN mask once and derive every statistic from it.
+    na_mask = df.isna()
+
+    # Per-column statistics
+    col_missing = na_mask.sum()
+    total_missing = int(col_missing.sum())
+    cols_with_missing = col_missing[col_missing > 0].to_dict()
+
+    # Per-row statistics
+    row_missing = na_mask.sum(axis=1)
+    rows_with_missing = int((row_missing > 0).sum())
+
+    # Always a built-in float, matching the explicit int()/float() casts
+    # applied to the other fields.
+    missing_rate = float(total_missing / total_cells) if total_cells > 0 else 0.0
+
+    return {
+        'total_cells': total_cells,
+        'total_missing': total_missing,
+        'missing_rate': missing_rate,
+        'rows_with_missing': rows_with_missing,
+        'cols_with_missing': len(cols_with_missing),
+        'col_missing_detail': {
+            col: {
+                'count': int(count),
+                # n_rows cannot be 0 here: a column with a missing cell
+                # implies at least one row.
+                'rate': float(int(count) / n_rows)
+            }
+            for col, count in cols_with_missing.items()
+        }
+    }
+