feat(dc/tool-c): Add missing value imputation feature with 6 methods and MICE

Major features:
1. Missing value imputation (6 simple methods + MICE):
   - Mean/Median/Mode/Constant imputation
   - Forward fill (ffill) and Backward fill (bfill) for time series
   - MICE multivariate imputation (in progress, shape issue to fix)

2. Auto precision detection:
   - Automatically match decimal places of original data
   - Prevent false precision (e.g. 13.57 instead of 13.566716417910449)

3. Categorical variable detection:
   - Auto-detect and skip categorical columns in MICE
   - Show warnings for unsuitable columns
   - Suggest mode imputation for categorical data

4. UI improvements:
   - Rename button: "Delete Missing" to "Missing Value Handling"
   - Remove standalone "Dedup" and "MICE" buttons
   - 3-tab dialog: Delete / Fill / Advanced Fill
   - Display column statistics and recommended methods
   - Extended warning messages (8 seconds for skipped columns)

5. Bug fixes:
   - Fix sessionService.updateSessionData -> saveProcessedData
   - Fix OperationResult interface (add message and stats)
   - Fix Toolbar button labels and remove the obsolete standalone buttons

Modified files:
Python: operations/fillna.py (new, 556 lines), main.py (3 new endpoints)
Backend: QuickActionService.ts, QuickActionController.ts, routes/index.ts
Frontend: MissingValueDialog.tsx (new, 436 lines), Toolbar.tsx, index.tsx
Tests: test_fillna_operations.py (774 lines), test scripts and docs
Docs: 5 documentation files updated

Known issues:
- MICE imputation has DataFrame shape mismatch issue (under debugging)
- Workaround: Use 6 simple imputation methods first

Status: Simple imputation methods complete; MICE imputation still being debugged
Lines added: ~2000 lines across 3 tiers
This commit is contained in:
2025-12-10 13:06:00 +08:00
parent f4f1d09837
commit 74cf346453
102 changed files with 3806 additions and 181 deletions

View File

@@ -517,4 +517,6 @@ export default FulltextDetailDrawer;

View File

@@ -116,4 +116,6 @@ export function useFulltextResults({

View File

@@ -79,4 +79,6 @@ export function useFulltextTask({

View File

@@ -470,4 +470,6 @@ export default FulltextResults;

View File

@@ -111,3 +111,5 @@ export const useAssets = (activeTab: AssetTabType) => {

View File

@@ -101,3 +101,5 @@ export const useRecentTasks = () => {

View File

@@ -337,3 +337,5 @@ export default BinningDialog;

View File

@@ -300,3 +300,5 @@ export default DropnaDialog;

View File

@@ -0,0 +1,436 @@
import React, { useState, useEffect } from 'react';
import { Modal, Tabs, Radio, Select, Input, Checkbox, Alert, App, Row, Col, InputNumber, Space } from 'antd';
/**
 * Props accepted by the missing-value handling dialog.
 */
interface Props {
  /** Whether the modal is currently shown. */
  visible: boolean;
  /** Called when the dialog should close (on cancel, and after a successful operation). */
  onClose: () => void;
  /** Receives the refreshed data preview returned by the backend once an operation succeeds. */
  onApply: (newData: any[]) => void;
  /** Columns offered for selection; `name` is the value sent to the backend, `id` is the React key. */
  columns: Array<{ id: string; name: string }>;
  /** Active session id; every operation is refused while it is null. */
  sessionId: string | null;
  /**
   * Current table rows. Not read by the dialog; declared as optional so that a
   * call site forwarding `data={state.data}` type-checks.
   */
  data?: unknown[];
}
/** Simple (single-column) imputation methods offered on the "fill" tab. */
type FillMethod = 'mean' | 'median' | 'mode' | 'constant' | 'ffill' | 'bfill';

/** Display label for every simple imputation method (shared by the recommendation line and the radio list). */
const FILL_METHOD_LABELS: Record<FillMethod, string> = {
  mean: '均值填补',
  median: '中位数填补',
  mode: '众数填补',
  constant: '固定值填补',
  ffill: '前向填充',
  bfill: '后向填充',
};

/** Methods in the order they are listed in the dialog. */
const FILL_METHODS = Object.keys(FILL_METHOD_LABELS) as FillMethod[];

/** OK-button caption per tab; any tab not listed here uses the MICE caption. */
const OK_TEXT_BY_TAB: Record<string, string> = {
  delete: '执行删除',
  fill: '执行填补',
};

/**
 * Column statistics as read by this dialog.
 * NOTE(review): shape inferred from the fields the dialog renders — confirm against the stats endpoint.
 */
interface ColumnStats {
  missing_count?: number;
  missing_rate?: number;
  valid_count?: number;
  mean?: number | null;
  median?: number | null;
  recommended_method?: string;
}

/** Response envelope as consumed by this dialog. */
interface ApiResult {
  success?: boolean;
  error?: string;
  stats?: ColumnStats;
  data?: { message?: string; newDataPreview: unknown[] };
}

/** Narrows an arbitrary value to one of the supported simple imputation methods. */
const isFillMethod = (value: unknown): value is FillMethod =>
  typeof value === 'string' && Object.prototype.hasOwnProperty.call(FILL_METHOD_LABELS, value);

/**
 * POSTs `payload` as JSON and returns the parsed response envelope.
 *
 * A body that is not valid JSON yields an empty envelope, and a non-2xx status
 * always yields `success: false`, so callers fall back to their own error text
 * instead of showing a raw JSON parse error.
 *
 * @throws when the request itself fails (e.g. network error).
 */
const postJson = async (url: string, payload: unknown): Promise<ApiResult> => {
  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });
  let body: ApiResult = {};
  try {
    body = ((await response.json()) as ApiResult | null) ?? {};
  } catch {
    // Non-JSON body (e.g. an HTML error page): keep the empty envelope.
  }
  return response.ok ? body : { ...body, success: false };
};

/**
 * Modal dialog for handling missing values, with three tabs:
 * delete rows/columns, simple single-column imputation, and MICE imputation.
 *
 * Each tab submits to its own backend endpoint; on success the returned data
 * preview is handed to `onApply` and the dialog is closed via `onClose`.
 */
const MissingValueDialog: React.FC<Props> = ({
  visible,
  onClose,
  onApply,
  columns,
  sessionId,
}) => {
  const { message } = App.useApp();
  const [activeTab, setActiveTab] = useState('delete');
  const [loading, setLoading] = useState(false);

  // Tab 1: delete
  const [deleteMethod, setDeleteMethod] = useState<'row' | 'column'>('row');
  const [threshold, setThreshold] = useState(50);
  // No setter is exposed in the UI, so this is always sent as an empty list.
  const [selectedColumns] = useState<string[]>([]);

  // Tab 2: simple imputation
  const [selectedColumn, setSelectedColumn] = useState('');
  const [newColumnName, setNewColumnName] = useState('');
  const [fillMethod, setFillMethod] = useState<FillMethod>('median');
  const [fillValue, setFillValue] = useState('');
  const [columnStats, setColumnStats] = useState<ColumnStats | null>(null);

  // Tab 3: MICE
  const [miceColumns, setMiceColumns] = useState<string[]>([]);
  const [nIterations, setNIterations] = useState(10);
  const [randomState, setRandomState] = useState(42);

  // Suggest a name for the new column whenever a different source column is picked.
  // Keyed on the column only, so switching tabs does not discard a name the user typed.
  useEffect(() => {
    if (selectedColumn) {
      setNewColumnName(`${selectedColumn}_填补`);
    }
  }, [selectedColumn]);

  // Load statistics for the selected column while the "fill" tab is active.
  useEffect(() => {
    if (!sessionId || !selectedColumn || activeTab !== 'fill') return;

    // Set by the cleanup so a slow response for a previous column cannot overwrite newer stats.
    let cancelled = false;
    // Drop the previous column's stats right away instead of showing them next to the new column.
    setColumnStats(null);

    const loadStats = async () => {
      try {
        const result = await postJson('/api/v1/dc/tool-c/fillna/stats', {
          sessionId,
          column: selectedColumn,
        });
        if (!cancelled && result.success) {
          setColumnStats(result.stats ?? null);
        }
      } catch (error) {
        if (!cancelled) {
          console.error('获取统计信息失败:', error);
        }
      }
    };
    void loadStats();

    return () => {
      cancelled = true;
    };
  }, [selectedColumn, activeTab, sessionId]);

  // Pre-select the recommended method, but only if it is one the dialog can actually submit.
  useEffect(() => {
    const recommended = columnStats?.recommended_method;
    if (isFillMethod(recommended)) {
      setFillMethod(recommended);
    }
  }, [columnStats]);

  /**
   * Shared request flow for all three tabs: toggles the loading state, posts the
   * payload, and on success notifies the user, applies the preview and closes.
   */
  const submit = async (
    url: string,
    payload: unknown,
    failureText: string,
    notifySuccess: (data: NonNullable<ApiResult['data']>) => void,
  ): Promise<void> => {
    setLoading(true);
    try {
      const result = await postJson(url, payload);
      if (result.success && result.data) {
        notifySuccess(result.data);
        onApply(result.data.newDataPreview);
        onClose();
      } else {
        message.error(result.error || failureText);
      }
    } catch (error: unknown) {
      message.error(error instanceof Error && error.message ? error.message : failureText);
    } finally {
      setLoading(false);
    }
  };

  // Run the delete operation.
  const handleDelete = async (): Promise<void> => {
    if (!sessionId) {
      message.error('Session ID不存在');
      return;
    }
    await submit(
      '/api/v1/dc/tool-c/quick-action',
      {
        sessionId,
        action: 'dropna',
        params: {
          method: deleteMethod,
          threshold,
          columns: selectedColumns,
        },
      },
      '删除失败',
      () => {
        message.success('删除成功');
      },
    );
  };

  // Run a simple single-column imputation.
  const handleFillSimple = async (): Promise<void> => {
    if (!sessionId || !selectedColumn) {
      message.error('请选择列');
      return;
    }
    if (!newColumnName) {
      message.error('请输入新列名');
      return;
    }
    if (fillMethod === 'constant' && !fillValue) {
      message.error('请输入填补值');
      return;
    }
    await submit(
      '/api/v1/dc/tool-c/fillna/simple',
      {
        sessionId,
        column: selectedColumn,
        newColumnName,
        method: fillMethod,
        fillValue: fillMethod === 'constant' ? fillValue : undefined,
      },
      '填补失败',
      (data) => {
        message.success(data.message || '填补成功');
      },
    );
  };

  // Run MICE imputation.
  const handleFillMice = async (): Promise<void> => {
    if (!sessionId) {
      message.error('Session ID不存在');
      return;
    }
    if (miceColumns.length === 0) {
      message.error('请至少选择一列');
      return;
    }
    await submit(
      '/api/v1/dc/tool-c/fillna/mice',
      {
        sessionId,
        columns: miceColumns,
        nIterations,
        randomState,
      },
      'MICE填补失败',
      (data) => {
        const msg = data.message || 'MICE填补成功';
        // When the message mentions skipped / categorical columns, show it as a
        // warning (not a success) and keep it on screen longer.
        if (msg.includes('跳过') || msg.includes('分类变量')) {
          message.warning(msg, 8); // shown for 8 seconds
        } else {
          message.success(msg, 4); // shown for 4 seconds
        }
      },
    );
  };

  // Dispatch the OK button to the handler of the active tab.
  const handleOk = () => {
    const handlers: Record<string, () => Promise<void>> = {
      delete: handleDelete,
      fill: handleFillSimple,
      mice: handleFillMice,
    };
    void handlers[activeTab]?.();
  };

  const recommendedMethod = columnStats?.recommended_method;
  const recommendedLabel = isFillMethod(recommendedMethod)
    ? FILL_METHOD_LABELS[recommendedMethod]
    : undefined;

  return (
    <Modal
      title="缺失值处理"
      open={visible}
      onCancel={onClose}
      onOk={handleOk}
      okText={OK_TEXT_BY_TAB[activeTab] ?? '执行MICE填补'}
      cancelText="取消"
      width={680}
      confirmLoading={loading}
    >
      <Tabs
        activeKey={activeTab}
        onChange={setActiveTab}
        items={[
          {
            key: 'delete',
            label: '删除',
            children: (
              <div className="space-y-4">
                <Alert
                  message="删除含有缺失值的行或列(原始数据将被修改)"
                  type="warning"
                  showIcon
                />
                <div>
                  <div className="mb-2 font-medium"></div>
                  <Radio.Group value={deleteMethod} onChange={(e) => setDeleteMethod(e.target.value)}>
                    <Space direction="vertical">
                      <Radio value="row"></Radio>
                      <Radio value="column">{threshold}%</Radio>
                    </Space>
                  </Radio.Group>
                </div>
                {deleteMethod === 'column' && (
                  <div>
                    <div className="mb-2">{threshold}%</div>
                    <InputNumber
                      min={0}
                      max={100}
                      value={threshold}
                      // `??` rather than `||`: 0 is a valid threshold and must not be replaced by the default.
                      onChange={(val) => setThreshold(val ?? 50)}
                      style={{ width: '100%' }}
                    />
                  </div>
                )}
              </div>
            ),
          },
          {
            key: 'fill',
            label: '填补',
            children: (
              <div className="space-y-4">
                <Alert
                  message="💡 新列将创建在原列旁边,便于对比"
                  type="info"
                  showIcon
                />
                <div>
                  <div className="mb-2 font-medium"></div>
                  <Select
                    // An empty string counts as a value and would hide the placeholder.
                    value={selectedColumn || undefined}
                    onChange={setSelectedColumn}
                    placeholder="选择要填补的列"
                    style={{ width: '100%' }}
                    options={columns.map(col => ({
                      label: col.name,
                      value: col.name
                    }))}
                  />
                </div>
                <div>
                  <div className="mb-2 font-medium"></div>
                  <Input
                    value={newColumnName}
                    onChange={(e) => setNewColumnName(e.target.value)}
                    placeholder="如体重_填补"
                  />
                </div>
                {columnStats && (
                  <div className="p-3 bg-gray-50 rounded">
                    <div className="text-sm space-y-1">
                      <div> {columnStats.missing_count}{columnStats.missing_rate}%</div>
                      <div> {columnStats.valid_count}</div>
                      {columnStats.mean != null && <div> {columnStats.mean.toFixed(2)}</div>}
                      {columnStats.median != null && <div> {columnStats.median.toFixed(2)}</div>}
                      <div> <span className="text-blue-600">{recommendedLabel}</span></div>
                    </div>
                  </div>
                )}
                <div>
                  <div className="mb-2 font-medium"></div>
                  <Radio.Group value={fillMethod} onChange={(e) => setFillMethod(e.target.value)}>
                    <Space direction="vertical">
                      {FILL_METHODS.map(method => (
                        <Radio key={method} value={method}>
                          {FILL_METHOD_LABELS[method]}
                        </Radio>
                      ))}
                    </Space>
                  </Radio.Group>
                </div>
                {fillMethod === 'constant' && (
                  <div>
                    <div className="mb-2 font-medium"></div>
                    <Input
                      value={fillValue}
                      onChange={(e) => setFillValue(e.target.value)}
                      placeholder="输入固定值"
                    />
                  </div>
                )}
              </div>
            ),
          },
          {
            key: 'mice',
            label: '高级填补',
            children: (
              <div className="space-y-4">
                <Alert
                  message="⭐ MICE多重插补 - 医学研究高质量填补的首选方法"
                  type="success"
                  showIcon
                  description="MICE会根据其他变量的值来预测缺失值适合缺失率5%-30%、需要考虑变量间相关性的场景。"
                />
                <Alert
                  message="⚠️ 重要MICE仅适用于数值列"
                  type="warning"
                  showIcon
                  description={
                    <div className="text-sm">
                      <div> </div>
                      <div> 使"众数填补"</div>
                      <div className="mt-2 text-orange-600"></div>
                    </div>
                  }
                />
                <div>
                  <div className="mb-2 font-medium"></div>
                  <Checkbox.Group
                    value={miceColumns}
                    onChange={(values) => setMiceColumns(values.map(String))}
                    style={{ width: '100%' }}
                  >
                    <Space direction="vertical">
                      {columns.map(col => (
                        <Checkbox key={col.id} value={col.name}>
                          {col.name}
                        </Checkbox>
                      ))}
                    </Space>
                  </Checkbox.Group>
                </div>
                <div className="p-3 bg-blue-50 rounded">
                  <div className="text-sm space-y-1">
                    <div> + "_MICE"</div>
                    <div> </div>
                    <div> 101</div>
                  </div>
                </div>
                <Row gutter={16}>
                  <Col span={12}>
                    <div className="mb-2"></div>
                    <InputNumber
                      min={5}
                      max={50}
                      value={nIterations}
                      onChange={(val) => setNIterations(val ?? 10)}
                      style={{ width: '100%' }}
                    />
                  </Col>
                  <Col span={12}>
                    <div className="mb-2"></div>
                    <InputNumber
                      value={randomState}
                      // `??` rather than `||`: 0 is a legitimate random seed.
                      onChange={(val) => setRandomState(val ?? 42)}
                      style={{ width: '100%' }}
                    />
                  </Col>
                </Row>
              </div>
            ),
          },
        ]}
      />
    </Modal>
  );
};
export default MissingValueDialog;

View File

@@ -113,7 +113,7 @@ const Toolbar: React.FC<ToolbarProps> = ({
/>
<ToolbarButton
icon={Trash2}
label="删除缺失值"
label="缺失值处理"
colorClass="text-red-600 bg-red-50 hover:bg-red-100"
onClick={onDropnaClick}
disabled={!sessionId}
@@ -125,13 +125,6 @@ const Toolbar: React.FC<ToolbarProps> = ({
onClick={onComputeClick}
disabled={!sessionId}
/>
<ToolbarButton
icon={FileSearch}
label="去重"
colorClass="text-orange-600 bg-orange-50 hover:bg-orange-100"
onClick={onDedupClick}
disabled={true}
/>
<div className="w-[1px] h-8 bg-slate-200 mx-2"></div>
@@ -143,13 +136,6 @@ const Toolbar: React.FC<ToolbarProps> = ({
onClick={onPivotClick}
disabled={!sessionId}
/>
<ToolbarButton
icon={CalendarClock}
label="多重插补"
colorClass="text-rose-600 bg-rose-50 hover:bg-rose-100"
onClick={onMiceClick}
disabled={true}
/>
<div className="flex-1"></div>

View File

@@ -14,7 +14,7 @@ import FilterDialog from './components/FilterDialog';
import RecodeDialog from './components/RecodeDialog';
import BinningDialog from './components/BinningDialog';
import ConditionalDialog from './components/ConditionalDialog';
import DropnaDialog from './components/DropnaDialog';
import MissingValueDialog from './components/MissingValueDialog';
import ComputeDialog from './components/ComputeDialog';
import PivotDialog from './components/PivotDialog';
import * as api from '../../api/toolC';
@@ -342,7 +342,7 @@ const ToolC = () => {
onApply={handleQuickActionDataUpdate}
/>
<DropnaDialog
<MissingValueDialog
visible={state.dropnaDialogVisible}
columns={state.columns}
data={state.data}

View File

@@ -63,3 +63,5 @@ export interface DataStats {

View File

@@ -59,3 +59,5 @@ export type AssetTabType = 'all' | 'processed' | 'raw';

View File

@@ -14,3 +14,5 @@ export { default as Placeholder } from './Placeholder';