Major fixes:
- Fix pivot transformation with special characters in column names
- Fix compute column validation for Chinese punctuation
- Fix recode dialog to fetch unique values from full dataset via new API
- Add column mapping mechanism to handle special characters

Database migration:
- Add column_mapping field to dc_tool_c_sessions table
- Migration file: 20251208_add_column_mapping

UX improvements:
- Darken table grid lines for better visibility
- Reduce column width by 40% with tooltip support
- Insert new columns next to source columns
- Preserve original row order after operations
- Add notice about 50-row preview limit

Modified files:
- Backend: SessionService, SessionController, QuickActionService, routes
- Python: pivot.py, compute.py, recode.py, binning.py, conditional.py
- Frontend: DataGrid, RecodeDialog, index.tsx, ag-grid-custom.css
- Database: schema.prisma, migration SQL

Status: Code complete, database migrated, ready for testing
151 lines
4.7 KiB
Python
151 lines
4.7 KiB
Python
"""
Drop missing values - prebuilt function.

Supports row-wise deletion, column-wise deletion, and threshold control.
"""
|
||
|
||
import pandas as pd
|
||
from typing import Literal, Optional, List
|
||
|
||
|
||
def drop_missing_values(
    df: pd.DataFrame,
    method: Literal['row', 'column', 'both'] = 'row',
    threshold: Optional[float] = None,
    subset: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Drop missing values from a data frame and print a progress report.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Input data frame.
        method: Deletion strategy.
            - 'row': drop rows that contain missing values.
            - 'column': drop columns whose missing rate exceeds ``threshold``.
            - 'both': drop columns first, then rows.
            Any other value leaves the data unchanged (only the report is
            printed).
        threshold: Missing-rate cut-off (expected between 0 and 1), used by
            'column' and 'both'. A column is dropped when its missing rate is
            strictly greater than this value. Defaults to 0.5 (50%).
        subset: Restrict the row check to these columns (used by 'row' and
            'both'). In 'both' mode, columns that were already removed by the
            column step are skipped; if every requested column was removed,
            no rows are dropped.

    Returns:
        A new data frame with the missing values removed.

    Raises:
        KeyError: If ``subset`` names a column that does not exist in ``df``
            (raised by ``DataFrame.dropna``).

    Examples:
        # Drop rows containing any missing value
        drop_missing_values(df, method='row')

        # Drop columns whose missing rate is > 30%
        drop_missing_values(df, method='column', threshold=0.3)

        # Drop sparse columns first, then incomplete rows
        drop_missing_values(df, method='both', threshold=0.5)

        # Only check the given columns
        drop_missing_values(df, method='row', subset=['age', 'BMI'])
    """
    result = df.copy()
    original_shape = result.shape

    print(f'原始数据: {original_shape[0]} 行 × {original_shape[1]} 列')
    print(f'缺失值总数: {result.isna().sum().sum()}')
    print('')

    # Default threshold
    if threshold is None:
        threshold = 0.5

    # Columns removed by the column step; the row step needs this so it does
    # not ask dropna() to inspect labels that no longer exist.
    dropped_cols = set()

    # Column-wise deletion
    if method in ('column', 'both'):
        # Per-column missing counts and rates (computed once, reused below)
        missing_count = result.isna().sum()
        missing_rate = missing_count / len(result)
        cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()

        if cols_to_drop:
            print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}个')
            for col in cols_to_drop:
                rate = missing_rate[col]
                count = missing_count[col]
                print(f' - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')

            result = result.drop(columns=cols_to_drop)
            dropped_cols.update(cols_to_drop)
            print(f'删除后: {result.shape[0]} 行 × {result.shape[1]} 列')
            print('')
        else:
            print(f'没有找到缺失率>{threshold*100:.0f}%的列')
            print('')

    # Row-wise deletion
    if method in ('row', 'both'):
        before_rows = len(result)

        if subset:
            # Skip only the columns that the column step removed; labels that
            # never existed are still passed through so dropna() raises
            # KeyError exactly as before.
            check_cols = subset
            if dropped_cols:
                skipped = [col for col in subset if col in dropped_cols]
                if skipped:
                    print(f'指定列已在按列删除时被移除,跳过: {skipped}')
                    check_cols = [
                        col for col in subset if col not in dropped_cols
                    ]

            if check_cols:
                # Only check the requested columns
                print(f'仅检查指定列的缺失值: {check_cols}')
                result = result.dropna(subset=check_cols)
            # else: every requested column is gone, so there is nothing left
            # to check and no row is dropped.
        else:
            # Check all columns
            result = result.dropna()

        dropped_rows = before_rows - len(result)
        if dropped_rows > 0:
            # dropped_rows > 0 implies before_rows > 0, so the division is safe
            print(f'删除了 {dropped_rows} 行(包含缺失值的行)')
            print(f'保留了 {len(result)} 行({len(result)/before_rows*100:.1f}%)')
        else:
            print('没有找到包含缺失值的行')
        print('')

    # Final statistics
    final_shape = result.shape
    print(f'最终结果: {final_shape[0]} 行 × {final_shape[1]} 列')
    print(f'删除了 {original_shape[0] - final_shape[0]} 行')
    print(f'删除了 {original_shape[1] - final_shape[1]} 列')
    print(f'剩余缺失值: {result.isna().sum().sum()}')

    # Warn when nothing is left
    if len(result) == 0:
        print('\n⚠️ 警告: 删除后数据为空!')

    return result
|
||
|
||
|
||
def get_missing_summary(df: pd.DataFrame) -> dict:
    """
    Build a summary of the missing values in a data frame.

    Args:
        df: Input data frame.

    Returns:
        A dict with native Python numbers:
            - 'total_cells': rows × columns.
            - 'total_missing': number of missing cells.
            - 'missing_rate': total_missing / total_cells as a float
              (0.0 when the frame has no cells).
            - 'rows_with_missing': number of rows with at least one
              missing value.
            - 'cols_with_missing': number of columns with at least one
              missing value.
            - 'col_missing_detail': ``{column: {'count': int, 'rate': float}}``
              for each column that has missing values.
    """
    # Compute the missing-value mask once and derive every statistic from it.
    na_mask = df.isna()
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols

    # Per-column statistics
    col_missing = na_mask.sum()
    total_missing = int(col_missing.sum())
    affected = col_missing[col_missing > 0]

    # A positive count implies n_rows > 0, so the rate division is safe.
    col_missing_detail = {
        col: {
            'count': int(count),
            'rate': float(count / n_rows),
        }
        for col, count in affected.items()
    }

    # Per-row statistics
    rows_with_missing = int(na_mask.any(axis=1).sum())

    return {
        'total_cells': total_cells,
        'total_missing': total_missing,
        # Always a built-in float, matching the other normalised fields.
        'missing_rate': (
            float(total_missing / total_cells) if total_cells > 0 else 0.0
        ),
        'rows_with_missing': rows_with_missing,
        'cols_with_missing': len(col_missing_detail),
        'col_missing_detail': col_missing_detail,
    }
|
||
|
||
|