feat(dc/tool-c): Add missing value imputation feature with 6 methods and MICE

Major features:
1. Missing value imputation (6 simple methods + MICE):
   - Mean/Median/Mode/Constant imputation
   - Forward fill (ffill) and Backward fill (bfill) for time series
   - MICE multivariate imputation (in progress, shape issue to fix)

2. Auto precision detection:
   - Automatically match decimal places of original data
   - Prevent false precision (e.g. 13.57 instead of 13.566716417910449)

3. Categorical variable detection:
   - Auto-detect and skip categorical columns in MICE
   - Show warnings for unsuitable columns
   - Suggest mode imputation for categorical data

4. UI improvements:
   - Rename button: "Delete Missing" to "Missing Value Handling"
   - Remove standalone "Dedup" and "MICE" buttons
   - 3-tab dialog: Delete / Fill / Advanced Fill
   - Display column statistics and recommended methods
   - Extended warning messages (8 seconds for skipped columns)

5. Bug fixes:
   - Fix sessionService.updateSessionData -> saveProcessedData
   - Fix OperationResult interface (add message and stats)
   - Fix Toolbar button labels and remove the standalone buttons

Modified files:
Python: operations/fillna.py (new, 556 lines), main.py (3 new endpoints)
Backend: QuickActionService.ts, QuickActionController.ts, routes/index.ts
Frontend: MissingValueDialog.tsx (new, 437 lines), Toolbar.tsx, index.tsx
Tests: test_fillna_operations.py (774 lines), test scripts and docs
Docs: 5 documentation files updated

Known issues:
- MICE imputation has DataFrame shape mismatch issue (under debugging)
- Workaround: Use 6 simple imputation methods first

Status: Development complete, MICE debugging in progress
Lines added: ~2000 lines across 3 tiers
This commit is contained in:
2025-12-10 13:06:00 +08:00
parent f4f1d09837
commit 74cf346453
102 changed files with 3806 additions and 181 deletions

View File

@@ -70,6 +70,7 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
# ==================== Pydantic Models ====================
@@ -148,6 +149,29 @@ class PivotRequest(BaseModel):
pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序
class FillnaStatsRequest(BaseModel):
    """Request model for fetching the missing-value statistics of a column."""
    # Records; the endpoint loads them with pd.DataFrame(...).
    data: List[Dict[str, Any]]
    # Name of the column whose missing values are summarised.
    column: str
class FillnaSimpleRequest(BaseModel):
    """Request model for simple (single-column) missing-value imputation."""
    # Records; the endpoint loads them with pd.DataFrame(...).
    data: List[Dict[str, Any]]
    # Name of the source column to impute.
    column: str
    # Name of the new column, forwarded to fillna_simple.
    new_column_name: str
    method: str  # 'mean', 'median', 'mode', 'constant', 'ffill', 'bfill'
    # Constant fill value; used when method='constant'.
    fill_value: Any = None
class FillnaMiceRequest(BaseModel):
    """Request model for MICE multiple imputation."""
    # Records; the endpoint loads them with pd.DataFrame(...).
    data: List[Dict[str, Any]]
    # Names of the columns to impute.
    columns: List[str]
    # Number of iterations, forwarded to fillna_mice.
    n_iterations: int = 10
    # Random seed, forwarded to fillna_mice.
    random_state: int = 42
# ==================== API路由 ====================
@app.get("/")
@@ -1267,6 +1291,174 @@ async def operation_pivot(request: PivotRequest):
}, status_code=400)
@app.post("/api/operations/fillna-stats")
async def operation_fillna_stats(request: FillnaStatsRequest):
    """
    Return the missing-value statistics of a single column.

    Args:
        request: FillnaStatsRequest
            - data: records to analyse
            - column: column name

    Returns:
        JSONResponse. On success (HTTP 200):
        {
            "success": True,
            "stats": Dict (missing-value statistics),
            "execution_time": float (seconds)
        }
        On any exception (HTTP 400):
        {
            "success": False,
            "error": str,
            "execution_time": float (seconds)
        }
    """
    import time

    # Start the clock before the try block so the error path can always
    # report an elapsed time without introspecting locals(). perf_counter is
    # monotonic, so the duration cannot go negative on a clock adjustment.
    start_time = time.perf_counter()
    try:
        import pandas as pd

        # Convert the incoming records to a DataFrame.
        df = pd.DataFrame(request.data)

        # Delegate to the statistics helper.
        stats = get_column_missing_stats(df, request.column)

        execution_time = time.perf_counter() - start_time
        logger.info(f"获取列 '{request.column}' 的缺失值统计成功")
        return JSONResponse(content={
            "success": True,
            "stats": stats,
            "execution_time": execution_time
        })
    except Exception as e:
        # Top-level API boundary: log with traceback, report as a 400.
        logger.exception(f"获取缺失值统计失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.perf_counter() - start_time
        }, status_code=400)
@app.post("/api/operations/fillna-simple")
async def operation_fillna_simple(request: FillnaSimpleRequest):
    """
    Impute missing values with a simple method (mean, median, mode,
    constant, forward fill, backward fill).

    Args:
        request: FillnaSimpleRequest
            - data: records to process
            - column: source column name
            - new_column_name: name of the new column
            - method: imputation method
            - fill_value: constant value (used when method='constant')

    Returns:
        JSONResponse. When fillna_simple returns (HTTP 200):
        {
            "success": bool,
            "result_data": List[Dict],
            "stats": Dict (imputation statistics),
            "message": str,
            "execution_time": float (seconds)
        }
        On any exception (HTTP 400):
        {
            "success": False,
            "error": str,
            "execution_time": float (seconds)
        }
    """
    import time

    # Start the clock before the try block so the error path can always
    # report an elapsed time without introspecting locals(). perf_counter is
    # monotonic, so the duration cannot go negative on a clock adjustment.
    start_time = time.perf_counter()
    try:
        import pandas as pd

        # Convert the incoming records to a DataFrame.
        df = pd.DataFrame(request.data)

        # Delegate to the imputation helper.
        result = fillna_simple(
            df,
            request.column,
            request.new_column_name,
            request.method,
            request.fill_value
        )

        execution_time = time.perf_counter() - start_time
        logger.info(f"简单填补成功: {request.method} on '{request.column}'")
        return JSONResponse(content={
            "success": result['success'],
            "result_data": result['result_data'],
            "stats": result['stats'],
            "message": result['message'],
            "execution_time": execution_time
        })
    except Exception as e:
        # Top-level API boundary: log with traceback, report as a 400.
        logger.exception(f"简单填补失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.perf_counter() - start_time
        }, status_code=400)
@app.post("/api/operations/fillna-mice")
async def operation_fillna_mice(request: FillnaMiceRequest):
    """
    Impute missing values with MICE multiple imputation.

    Args:
        request: FillnaMiceRequest
            - data: records to process
            - columns: names of the columns to impute
            - n_iterations: number of iterations
            - random_state: random seed

    Returns:
        JSONResponse. When fillna_mice returns (HTTP 200):
        {
            "success": bool,
            "result_data": List[Dict],
            "stats": Dict (per-column imputation statistics),
            "message": str,
            "execution_time": float (seconds)
        }
        On any exception (HTTP 400):
        {
            "success": False,
            "error": str,
            "execution_time": float (seconds)
        }
    """
    import time

    # Start the clock before the try block so the error path can always
    # report an elapsed time without introspecting locals(). perf_counter is
    # monotonic, so the duration cannot go negative on a clock adjustment.
    start_time = time.perf_counter()
    try:
        import pandas as pd

        # Convert the incoming records to a DataFrame.
        df = pd.DataFrame(request.data)

        # NOTE(review): fillna_mice is called synchronously inside an async
        # handler; if it is long-running it will block the event loop while
        # it runs — confirm whether that is acceptable.
        result = fillna_mice(
            df,
            request.columns,
            request.n_iterations,
            request.random_state
        )

        execution_time = time.perf_counter() - start_time
        logger.info(f"MICE填补成功: {len(request.columns)}")
        return JSONResponse(content={
            "success": result['success'],
            "result_data": result['result_data'],
            "stats": result['stats'],
            "message": result['message'],
            "execution_time": execution_time
        })
    except Exception as e:
        # Top-level API boundary: log with traceback, report as a 400.
        logger.exception(f"MICE填补失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.perf_counter() - start_time
        }, status_code=400)
# ==================== 启动配置 ====================
if __name__ == "__main__":