feat(dc/tool-c): Add missing value imputation feature with 6 methods and MICE
Major features:

1. Missing value imputation (6 simple methods + MICE):
   - Mean / Median / Mode / Constant imputation
   - Forward fill (ffill) and backward fill (bfill) for time series
   - MICE multivariate imputation (in progress; a DataFrame shape issue remains to be fixed)
2. Auto precision detection:
   - Automatically match the decimal places of the original data
   - Prevent false precision (e.g. 13.57 instead of 13.566716417910449)
3. Categorical variable detection:
   - Auto-detect and skip categorical columns in MICE
   - Show warnings for unsuitable columns
   - Suggest mode imputation for categorical data
4. UI improvements:
   - Rename button: "Delete Missing" to "Missing Value Handling"
   - Remove the standalone "Dedup" and "MICE" buttons
   - 3-tab dialog: Delete / Fill / Advanced Fill
   - Display column statistics and recommended methods
   - Extended warning messages (8 seconds for skipped columns)
5. Bug fixes:
   - Fix sessionService.updateSessionData -> saveProcessedData
   - Fix OperationResult interface (add message and stats)
   - Fix Toolbar button labels and removal

Modified files:
- Python: operations/fillna.py (new, 556 lines), main.py (3 new endpoints)
- Backend: QuickActionService.ts, QuickActionController.ts, routes/index.ts
- Frontend: MissingValueDialog.tsx (new, 437 lines), Toolbar.tsx, index.tsx
- Tests: test_fillna_operations.py (774 lines), test scripts and docs
- Docs: 5 documentation files updated

Known issues:
- MICE imputation has a DataFrame shape mismatch issue (under debugging)
- Workaround: use the 6 simple imputation methods first

Status: Development complete, MICE debugging in progress.
Lines added: ~2000 lines across 3 tiers.
This commit is contained in:
@@ -517,4 +517,6 @@ export default FulltextDetailDrawer;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -116,4 +116,6 @@ export function useFulltextResults({
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -79,4 +79,6 @@ export function useFulltextTask({
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -470,4 +470,6 @@ export default FulltextResults;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -111,3 +111,5 @@ export const useAssets = (activeTab: AssetTabType) => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -101,3 +101,5 @@ export const useRecentTasks = () => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -337,3 +337,5 @@ export default BinningDialog;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -300,3 +300,5 @@ export default DropnaDialog;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,436 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import { Modal, Tabs, Radio, Select, Input, Checkbox, Alert, App, Row, Col, InputNumber, Space } from 'antd';
|
||||
|
||||
/**
 * Props accepted by MissingValueDialog.
 */
interface Props {
  /** Controls whether the modal is open. */
  visible: boolean;
  /** Invoked when the user cancels, and after an operation has been applied successfully. */
  onClose: () => void;
  /** Receives the `newDataPreview` rows returned by the backend after a successful operation. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- row shape is owned by the caller
  onApply: (newData: any[]) => void;
  /** Columns offered for selection; `name` is the value sent to the backend, `id` is used as the React key. */
  columns: Array<{ id: string; name: string }>;
  /** Session whose data is modified; while `null`, no operation can be executed. */
  sessionId: string | null;
  /**
   * Current table rows. The ToolC call site passes `data={state.data}` to this
   * dialog, so the prop is declared to keep that call type-correct.
   * Optional because the dialog itself does not read it.
   */
  data?: readonly unknown[];
}
|
||||
|
||||
/** Single-column imputation strategies offered on the "fill" tab. */
const FILL_METHODS = ['mean', 'median', 'mode', 'constant', 'ffill', 'bfill'] as const;
type FillMethod = (typeof FILL_METHODS)[number];

/** Display label for each fill method when shown as the recommended method. */
const FILL_METHOD_LABELS: Record<FillMethod, string> = {
  mean: '均值填补',
  median: '中位数填补',
  mode: '众数填补',
  constant: '固定值填补',
  ffill: '前向填充',
  bfill: '后向填充',
};

/** OK-button caption per tab; any other tab key falls back to the MICE caption. */
const OK_TEXT_BY_TAB: Record<string, string> = {
  delete: '执行删除',
  fill: '执行填补',
};

/**
 * Column statistics as read by this dialog.
 * NOTE(review): field types are inferred from how the dialog renders them;
 * the backend response is not validated here — confirm against the API.
 */
interface ColumnStats {
  missing_count: number;
  missing_rate: number;
  valid_count: number;
  mean?: number | null;
  median?: number | null;
  recommended_method?: string;
}

/** Response envelope shared by the endpoints this dialog calls (only the fields read here). */
interface ApiResult {
  success?: boolean;
  error?: string;
  stats?: ColumnStats;
  data?: { message?: string; newDataPreview: unknown[] };
}

/** Type guard: true when `value` is one of the fill methods the radio group can display. */
function isFillMethod(value: unknown): value is FillMethod {
  return typeof value === 'string' && (FILL_METHODS as readonly string[]).includes(value);
}

/** Picks a user-presentable message from a caught value, falling back to `fallback`. */
function describeError(error: unknown, fallback: string): string {
  return error instanceof Error && error.message ? error.message : fallback;
}

/**
 * POSTs `payload` as JSON and returns the parsed response body.
 *
 * @throws Error carrying the HTTP status when the body is not valid JSON
 *   (e.g. a proxy error page), instead of leaking a raw JSON parse error.
 */
async function postJson(url: string, payload: Record<string, unknown>): Promise<ApiResult> {
  const response = await fetch(url, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  try {
    return (await response.json()) as ApiResult;
  } catch {
    throw new Error(`请求失败 (HTTP ${response.status})`);
  }
}

/**
 * Modal dialog for handling missing values, with three tabs:
 * delete rows/columns, simple single-column fill, and MICE multi-column fill.
 *
 * Each tab posts to its own backend endpoint. On success the returned
 * `newDataPreview` is handed to `onApply` and the dialog closes via `onClose`.
 */
const MissingValueDialog: React.FC<Props> = ({
  visible,
  onClose,
  onApply,
  columns,
  sessionId,
}) => {
  const { message } = App.useApp();
  const [activeTab, setActiveTab] = useState('delete');
  const [loading, setLoading] = useState(false);

  // Tab 1: delete-related state
  const [deleteMethod, setDeleteMethod] = useState<'row' | 'column'>('row');
  const [threshold, setThreshold] = useState(50);
  // NOTE(review): no setter is used anywhere in this component, so an empty
  // column list is always sent with the dropna request — confirm this is intended.
  const [selectedColumns] = useState<string[]>([]);

  // Tab 2: simple-fill state
  const [selectedColumn, setSelectedColumn] = useState('');
  const [newColumnName, setNewColumnName] = useState('');
  const [fillMethod, setFillMethod] = useState<FillMethod>('median');
  const [fillValue, setFillValue] = useState('');
  const [columnStats, setColumnStats] = useState<ColumnStats | null>(null);

  // Tab 3: MICE state
  const [miceColumns, setMiceColumns] = useState<string[]>([]);
  const [nIterations, setNIterations] = useState(10);
  const [randomState, setRandomState] = useState(42);

  // When the selected column changes (while on the fill tab), derive the
  // default new-column name and load that column's statistics.
  useEffect(() => {
    if (!selectedColumn || activeTab !== 'fill') return undefined;

    setNewColumnName(`${selectedColumn}_填补`);
    // Clear the previous column's statistics so they are never displayed
    // against the newly selected column while loading or after a failed fetch.
    setColumnStats(null);
    if (!sessionId) return undefined;

    // Guards against a slow response for an earlier selection overwriting
    // the statistics of the current one.
    let cancelled = false;

    const fetchColumnStats = async (): Promise<void> => {
      try {
        const result = await postJson('/api/v1/dc/tool-c/fillna/stats', {
          sessionId,
          column: selectedColumn,
        });
        if (!cancelled && result.success && result.stats) {
          setColumnStats(result.stats);
        }
      } catch (error: unknown) {
        if (!cancelled) {
          console.error('获取统计信息失败:', error);
        }
      }
    };

    void fetchColumnStats();
    return () => {
      cancelled = true;
    };
  }, [selectedColumn, activeTab, sessionId]);

  // When statistics arrive, preselect the recommended fill method — but only
  // if it is one the radio group can actually display.
  useEffect(() => {
    const recommended = columnStats?.recommended_method;
    if (isFillMethod(recommended)) {
      setFillMethod(recommended);
    }
  }, [columnStats]);

  /**
   * Shared request flow for all three operations: toggles the loading flag,
   * posts the payload, reports failure, and on success announces the result,
   * applies the new data preview and closes the dialog.
   */
  const submit = async (
    url: string,
    payload: Record<string, unknown>,
    failureText: string,
    announce: (serverMessage?: string) => void,
  ): Promise<void> => {
    setLoading(true);
    try {
      const result = await postJson(url, payload);
      if (!result.success || !result.data) {
        message.error(result.error || failureText);
        return;
      }
      announce(result.data.message);
      onApply(result.data.newDataPreview);
      onClose();
    } catch (error: unknown) {
      message.error(describeError(error, failureText));
    } finally {
      setLoading(false);
    }
  };

  // Execute delete
  const handleDelete = async (): Promise<void> => {
    if (!sessionId) {
      message.error('Session ID不存在');
      return;
    }

    await submit(
      '/api/v1/dc/tool-c/quick-action',
      {
        sessionId,
        action: 'dropna',
        params: {
          method: deleteMethod,
          threshold,
          columns: selectedColumns,
        },
      },
      '删除失败',
      () => {
        message.success('删除成功');
      },
    );
  };

  // Execute simple fill
  const handleFillSimple = async (): Promise<void> => {
    if (!sessionId || !selectedColumn) {
      message.error('请选择列');
      return;
    }
    if (!newColumnName) {
      message.error('请输入新列名');
      return;
    }
    if (fillMethod === 'constant' && !fillValue) {
      message.error('请输入填补值');
      return;
    }

    await submit(
      '/api/v1/dc/tool-c/fillna/simple',
      {
        sessionId,
        column: selectedColumn,
        newColumnName,
        method: fillMethod,
        // Only the constant strategy carries a user-supplied value.
        fillValue: fillMethod === 'constant' ? fillValue : undefined,
      },
      '填补失败',
      (serverMessage) => {
        message.success(serverMessage || '填补成功');
      },
    );
  };

  // Execute MICE fill
  const handleFillMice = async (): Promise<void> => {
    if (!sessionId) {
      message.error('Session ID不存在');
      return;
    }
    if (miceColumns.length === 0) {
      message.error('请至少选择一列');
      return;
    }

    await submit(
      '/api/v1/dc/tool-c/fillna/mice',
      {
        sessionId,
        columns: miceColumns,
        nIterations,
        randomState,
      },
      'MICE填补失败',
      (serverMessage) => {
        const msg = serverMessage || 'MICE填补成功';
        // A message mentioning skipped / categorical columns is shown as a
        // warning and kept on screen longer (8s instead of 4s).
        if (msg.includes('跳过') || msg.includes('分类变量')) {
          message.warning(msg, 8);
        } else {
          message.success(msg, 4);
        }
      },
    );
  };

  // Dispatch the OK button to the handler of the active tab.
  const handleOk = (): void => {
    switch (activeTab) {
      case 'delete':
        void handleDelete();
        break;
      case 'fill':
        void handleFillSimple();
        break;
      case 'mice':
        void handleFillMice();
        break;
      default:
        break;
    }
  };

  const recommendedMethod = columnStats?.recommended_method;

  return (
    <Modal
      title="缺失值处理"
      open={visible}
      onCancel={onClose}
      onOk={handleOk}
      okText={OK_TEXT_BY_TAB[activeTab] ?? '执行MICE填补'}
      cancelText="取消"
      width={680}
      confirmLoading={loading}
    >
      <Tabs
        activeKey={activeTab}
        onChange={setActiveTab}
        items={[
          {
            key: 'delete',
            label: '删除',
            children: (
              <div className="space-y-4">
                <Alert
                  message="删除含有缺失值的行或列(原始数据将被修改)"
                  type="warning"
                  showIcon
                />

                <div>
                  <div className="mb-2 font-medium">删除方式:</div>
                  <Radio.Group value={deleteMethod} onChange={(e) => setDeleteMethod(e.target.value)}>
                    <Space direction="vertical">
                      <Radio value="row">删除含有缺失值的行</Radio>
                      <Radio value="column">删除缺失率过高的列(阈值:{threshold}%)</Radio>
                    </Space>
                  </Radio.Group>
                </div>

                {deleteMethod === 'column' && (
                  <div>
                    <div className="mb-2">缺失率阈值:{threshold}%</div>
                    <InputNumber
                      min={0}
                      max={100}
                      value={threshold}
                      // `??` rather than `||`: 0 is a valid threshold (min is 0)
                      // and must not be replaced by the default.
                      onChange={(val) => setThreshold(val ?? 50)}
                      style={{ width: '100%' }}
                    />
                  </div>
                )}
              </div>
            ),
          },
          {
            key: 'fill',
            label: '填补',
            children: (
              <div className="space-y-4">
                <Alert
                  message="💡 新列将创建在原列旁边,便于对比"
                  type="info"
                  showIcon
                />

                <div>
                  <div className="mb-2 font-medium">原始列:</div>
                  <Select
                    // An empty string counts as a value and hides the
                    // placeholder; pass undefined until a column is chosen.
                    value={selectedColumn || undefined}
                    onChange={setSelectedColumn}
                    placeholder="选择要填补的列"
                    style={{ width: '100%' }}
                    options={columns.map((col) => ({
                      label: col.name,
                      value: col.name,
                    }))}
                  />
                </div>

                <div>
                  <div className="mb-2 font-medium">新列名:</div>
                  <Input
                    value={newColumnName}
                    onChange={(e) => setNewColumnName(e.target.value)}
                    placeholder="如:体重_填补"
                  />
                </div>

                {columnStats && (
                  <div className="p-3 bg-gray-50 rounded">
                    <div className="text-sm space-y-1">
                      <div>• 缺失:{columnStats.missing_count}个({columnStats.missing_rate}%)</div>
                      <div>• 有效值:{columnStats.valid_count}个</div>
                      {typeof columnStats.mean === 'number' && (
                        <div>• 均值:{columnStats.mean.toFixed(2)}</div>
                      )}
                      {typeof columnStats.median === 'number' && (
                        <div>• 中位数:{columnStats.median.toFixed(2)}</div>
                      )}
                      <div>• 推荐方法:<span className="text-blue-600">
                        {isFillMethod(recommendedMethod) ? FILL_METHOD_LABELS[recommendedMethod] : null}
                      </span></div>
                    </div>
                  </div>
                )}

                <div>
                  <div className="mb-2 font-medium">填补方法:</div>
                  <Radio.Group value={fillMethod} onChange={(e) => setFillMethod(e.target.value)}>
                    <Space direction="vertical">
                      <Radio value="mean">均值填补(适合正态分布)</Radio>
                      <Radio value="median">中位数填补(适合偏态分布)</Radio>
                      <Radio value="mode">众数填补(适合分类变量)</Radio>
                      <Radio value="constant">固定值填补</Radio>
                      <Radio value="ffill">前向填充(时间序列)</Radio>
                      <Radio value="bfill">后向填充(时间序列)</Radio>
                    </Space>
                  </Radio.Group>
                </div>

                {fillMethod === 'constant' && (
                  <div>
                    <div className="mb-2 font-medium">填补值:</div>
                    <Input
                      value={fillValue}
                      onChange={(e) => setFillValue(e.target.value)}
                      placeholder="输入固定值"
                    />
                  </div>
                )}
              </div>
            ),
          },
          {
            key: 'mice',
            label: '高级填补',
            children: (
              <div className="space-y-4">
                <Alert
                  message="⭐ MICE多重插补 - 医学研究高质量填补的首选方法"
                  type="success"
                  showIcon
                  description="MICE会根据其他变量的值来预测缺失值,适合缺失率5%-30%、需要考虑变量间相关性的场景。"
                />

                <Alert
                  message="⚠️ 重要:MICE仅适用于数值列"
                  type="warning"
                  showIcon
                  description={
                    <div className="text-sm">
                      <div>• ✅ 适合:年龄、体重、血压、评分等数值列</div>
                      <div>• ❌ 不适合:婚姻状况、性别、职业等分类列(请使用"众数填补")</div>
                      <div className="mt-2 text-orange-600">分类列如果被选中,会自动跳过并创建原样副本列</div>
                    </div>
                  }
                />

                <div>
                  <div className="mb-2 font-medium">选择要填补的列(可多选):</div>
                  <Checkbox.Group
                    value={miceColumns}
                    // Checkbox values are column names; normalise to string[]
                    // so the state type holds regardless of antd's value type.
                    onChange={(values) => setMiceColumns(values.map(String))}
                    style={{ width: '100%' }}
                  >
                    <Space direction="vertical">
                      {columns.map((col) => (
                        <Checkbox key={col.id} value={col.name}>
                          {col.name}
                        </Checkbox>
                      ))}
                    </Space>
                  </Checkbox.Group>
                </div>

                <div className="p-3 bg-blue-50 rounded">
                  <div className="text-sm space-y-1">
                    <div>• 新列命名:原列名 + "_MICE"</div>
                    <div>• 新列位置:紧邻各原列</div>
                    <div>• 计算时间:10万行约1分钟</div>
                  </div>
                </div>

                <Row gutter={16}>
                  <Col span={12}>
                    <div className="mb-2">迭代次数:</div>
                    <InputNumber
                      min={5}
                      max={50}
                      value={nIterations}
                      onChange={(val) => setNIterations(val ?? 10)}
                      style={{ width: '100%' }}
                    />
                  </Col>
                  <Col span={12}>
                    <div className="mb-2">随机种子:</div>
                    <InputNumber
                      value={randomState}
                      // `??` rather than `||`: a seed of 0 is legitimate and
                      // must not be silently replaced by 42.
                      onChange={(val) => setRandomState(val ?? 42)}
                      style={{ width: '100%' }}
                    />
                  </Col>
                </Row>
              </div>
            ),
          },
        ]}
      />
    </Modal>
  );
};
|
||||
|
||||
export default MissingValueDialog;
|
||||
|
||||
@@ -113,7 +113,7 @@ const Toolbar: React.FC<ToolbarProps> = ({
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={Trash2}
|
||||
label="删除缺失值"
|
||||
label="缺失值处理"
|
||||
colorClass="text-red-600 bg-red-50 hover:bg-red-100"
|
||||
onClick={onDropnaClick}
|
||||
disabled={!sessionId}
|
||||
@@ -125,13 +125,6 @@ const Toolbar: React.FC<ToolbarProps> = ({
|
||||
onClick={onComputeClick}
|
||||
disabled={!sessionId}
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={FileSearch}
|
||||
label="去重"
|
||||
colorClass="text-orange-600 bg-orange-50 hover:bg-orange-100"
|
||||
onClick={onDedupClick}
|
||||
disabled={true}
|
||||
/>
|
||||
|
||||
<div className="w-[1px] h-8 bg-slate-200 mx-2"></div>
|
||||
|
||||
@@ -143,13 +136,6 @@ const Toolbar: React.FC<ToolbarProps> = ({
|
||||
onClick={onPivotClick}
|
||||
disabled={!sessionId}
|
||||
/>
|
||||
<ToolbarButton
|
||||
icon={CalendarClock}
|
||||
label="多重插补"
|
||||
colorClass="text-rose-600 bg-rose-50 hover:bg-rose-100"
|
||||
onClick={onMiceClick}
|
||||
disabled={true}
|
||||
/>
|
||||
|
||||
<div className="flex-1"></div>
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ import FilterDialog from './components/FilterDialog';
|
||||
import RecodeDialog from './components/RecodeDialog';
|
||||
import BinningDialog from './components/BinningDialog';
|
||||
import ConditionalDialog from './components/ConditionalDialog';
|
||||
import DropnaDialog from './components/DropnaDialog';
|
||||
import MissingValueDialog from './components/MissingValueDialog';
|
||||
import ComputeDialog from './components/ComputeDialog';
|
||||
import PivotDialog from './components/PivotDialog';
|
||||
import * as api from '../../api/toolC';
|
||||
@@ -342,7 +342,7 @@ const ToolC = () => {
|
||||
onApply={handleQuickActionDataUpdate}
|
||||
/>
|
||||
|
||||
<DropnaDialog
|
||||
<MissingValueDialog
|
||||
visible={state.dropnaDialogVisible}
|
||||
columns={state.columns}
|
||||
data={state.data}
|
||||
|
||||
@@ -63,3 +63,5 @@ export interface DataStats {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -59,3 +59,5 @@ export type AssetTabType = 'all' | 'processed' | 'raw';
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -14,3 +14,5 @@ export { default as Placeholder } from './Placeholder';
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user