feat(dc/tool-c): Add missing value imputation feature with 6 methods and MICE

Major features:
1. Missing value imputation (6 simple methods + MICE):
   - Mean/Median/Mode/Constant imputation
   - Forward fill (ffill) and backward fill (bfill) for time series
   - MICE multivariate imputation (in progress, shape issue to fix)
2. Auto precision detection:
   - Automatically match decimal places of original data
   - Prevent false precision (e.g. 13.57 instead of 13.566716417910449)
3. Categorical variable detection:
   - Auto-detect and skip categorical columns in MICE
   - Show warnings for unsuitable columns
   - Suggest mode imputation for categorical data
4. UI improvements:
   - Rename button: "Delete Missing" to "Missing Value Handling"
   - Remove standalone "Dedup" and "MICE" buttons
   - 3-tab dialog: Delete / Fill / Advanced Fill
   - Display column statistics and recommended methods
   - Extended warning messages (8 seconds for skipped columns)
5. Bug fixes:
   - Fix sessionService.updateSessionData -> saveProcessedData
   - Fix OperationResult interface (add message and stats)
   - Fix Toolbar button labels and removal

Modified files:
- Python: operations/fillna.py (new, 556 lines), main.py (3 new endpoints)
- Backend: QuickActionService.ts, QuickActionController.ts, routes/index.ts
- Frontend: MissingValueDialog.tsx (new, 437 lines), Toolbar.tsx, index.tsx
- Tests: test_fillna_operations.py (774 lines), test scripts and docs
- Docs: 5 documentation files updated

Known issues:
- MICE imputation has a DataFrame shape mismatch issue (under debugging)
- Workaround: use the 6 simple imputation methods first

Status: Development complete, MICE debugging in progress
Lines added: ~2000 lines across 3 tiers
2025-12-10 13:06:00 +08:00
parent f4f1d09837
commit 74cf346453
102 changed files with 3806 additions and 181 deletions
--- a/extraction_service/main.py
+++ b/extraction_service/main.py
@@ -70,6 +70,7 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin
 from operations.dropna import drop_missing_values, get_missing_summary
 from operations.compute import compute_column, get_formula_examples
 from operations.pivot import pivot_long_to_wide, get_pivot_preview
+from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats


 # ==================== Pydantic Models ====================
@@ -148,6 +149,29 @@ class PivotRequest(BaseModel):
    pivot_value_order: List[str] = []  # ✨ 新增：透视列值的原始顺序


+class FillnaStatsRequest(BaseModel):
+    """Request model for fetching a column's missing-value statistics."""
+    data: List[Dict[str, Any]]
+    column: str
+
+
+class FillnaSimpleRequest(BaseModel):
+    """Request model for simple imputation."""
+    data: List[Dict[str, Any]]
+    column: str
+    new_column_name: str
+    method: str  # 'mean', 'median', 'mode', 'constant', 'ffill', 'bfill'
+    fill_value: Any = None  # constant value; used when method='constant'
+
+
+class FillnaMiceRequest(BaseModel):
+    """Request model for MICE multiple imputation."""
+    data: List[Dict[str, Any]]
+    columns: List[str]  # names of the columns to impute
+    n_iterations: int = 10  # number of iterations
+    random_state: int = 42  # random seed
+
+
 # ==================== API路由 ====================

@app.get("/")
@@ -1267,6 +1291,174 @@ async def operation_pivot(request: PivotRequest):
        }, status_code=400)


+@app.post("/api/operations/fillna-stats")
+async def operation_fillna_stats(request: FillnaStatsRequest):
+    """
+    Get missing-value statistics for a column.
+    
+    Args:
+        request: FillnaStatsRequest
+            - data: the data
+            - column: column name
+    
+    Returns:
+        {
+            "success": bool,
+            "stats": Dict (missing-value statistics),
+            "execution_time": float
+        }
+    """
+    try:
+        import pandas as pd
+        import time
+        
+        start_time = time.time()
+        
+        # Convert to a DataFrame
+        df = pd.DataFrame(request.data)
+        
+        # Call the statistics function
+        stats = get_column_missing_stats(df, request.column)
+        
+        execution_time = time.time() - start_time
+        
+        logger.info(f"获取列 '{request.column}' 的缺失值统计成功")
+        
+        return JSONResponse(content={
+            "success": True,
+            "stats": stats,
+            "execution_time": execution_time
+        })
+        
+    except Exception as e:  # any failure is reported as HTTP 400 with the error text
+        logger.error(f"获取缺失值统计失败: {str(e)}")
+        return JSONResponse(content={
+            "success": False,
+            "error": str(e),
+            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
+        }, status_code=400)
+
+
+@app.post("/api/operations/fillna-simple")
+async def operation_fillna_simple(request: FillnaSimpleRequest):
+    """
+    Simple missing-value imputation (mean, median, mode, constant, forward fill, backward fill).
+    
+    Args:
+        request: FillnaSimpleRequest
+            - data: the data
+            - column: source column name
+            - new_column_name: new column name
+            - method: imputation method
+            - fill_value: constant value (used when method='constant')
+    
+    Returns:
+        {
+            "success": bool,
+            "result_data": List[Dict],
+            "stats": Dict (imputation statistics),
+            "message": str,
+            "execution_time": float
+        }
+    """
+    try:
+        import pandas as pd
+        import time
+        
+        start_time = time.time()
+        
+        # Convert to a DataFrame
+        df = pd.DataFrame(request.data)
+        
+        # Call the imputation function
+        result = fillna_simple(
+            df,
+            request.column,
+            request.new_column_name,
+            request.method,
+            request.fill_value
+        )
+        
+        execution_time = time.time() - start_time
+        
+        logger.info(f"简单填补成功: {request.method} on '{request.column}'")
+        
+        return JSONResponse(content={
+            "success": result['success'],
+            "result_data": result['result_data'],
+            "stats": result['stats'],
+            "message": result['message'],
+            "execution_time": execution_time
+        })
+        
+    except Exception as e:  # any failure is reported as HTTP 400 with the error text
+        logger.error(f"简单填补失败: {str(e)}")
+        return JSONResponse(content={
+            "success": False,
+            "error": str(e),
+            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
+        }, status_code=400)
+
+
+@app.post("/api/operations/fillna-mice")
+async def operation_fillna_mice(request: FillnaMiceRequest):
+    """
+    MICE multiple imputation.
+    
+    Args:
+        request: FillnaMiceRequest
+            - data: the data
+            - columns: list of column names to impute
+            - n_iterations: number of iterations
+            - random_state: random seed
+    
+    Returns:
+        {
+            "success": bool,
+            "result_data": List[Dict],
+            "stats": Dict (per-column imputation statistics),
+            "message": str,
+            "execution_time": float
+        }
+    """
+    try:
+        import pandas as pd
+        import time
+        
+        start_time = time.time()
+        
+        # Convert to a DataFrame
+        df = pd.DataFrame(request.data)
+        
+        # Call the MICE imputation function
+        result = fillna_mice(
+            df,
+            request.columns,
+            request.n_iterations,
+            request.random_state
+        )
+        
+        execution_time = time.time() - start_time
+        
+        logger.info(f"MICE填补成功: {len(request.columns)} 列")
+        
+        return JSONResponse(content={
+            "success": result['success'],
+            "result_data": result['result_data'],
+            "stats": result['stats'],
+            "message": result['message'],
+            "execution_time": execution_time
+        })
+        
+    except Exception as e:  # any failure is reported as HTTP 400 with the error text
+        logger.error(f"MICE填补失败: {str(e)}")
+        return JSONResponse(content={
+            "success": False,
+            "error": str(e),
+            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
+        }, status_code=400)
+
+
 # ==================== 启动配置 ====================

 if __name__ == "__main__":