Summary: - Complete IIT Manager Agent MVP Day 1 (12.5% progress) - Database: Create iit_schema with 5 tables (IitProject, IitPendingAction, IitTaskRun, IitUserMapping, IitAuditLog) - Backend: Add module structure (577 lines) and types (223 lines) - WeChat: Configure Enterprise WeChat app (CorpID, AgentID, Secret) - WeChat: Obtain web authorization and JS-SDK authorization - WeChat: Configure trusted domain (iit.xunzhengyixue.com) - Frontend: Deploy v1.2 with WeChat domain verification file - Frontend: Fix CRLF issue in docker-entrypoint.sh (CRLF -> LF) - Testing: 11/11 database CRUD tests passed - Testing: Access Token retrieval test passed - Docs: Create module status and development guide - Docs: Update MVP task list with Day 1 completion - Docs: Rename deployment doc to SAE real-time status record - Deployment: Update frontend internal IP to 172.17.173.80 Technical Details: - Prisma: Multi-schema support (iit_schema) - pg-boss: Job queue integration prepared - Taro 4.x: Framework selected for WeChat Mini Program - Shadow State: Architecture foundation laid - Docker: Fix entrypoint script line endings for Linux container Status: Day 1/14 complete, ready for Day 2 REDCap integration
180 lines
4.7 KiB
Python
180 lines
4.7 KiB
Python
"""
Drop missing values - prebuilt function.

Supports row-wise deletion, column-wise deletion, and threshold control.
"""
|
||
|
||
import pandas as pd
|
||
from typing import Literal, Optional, List
|
||
|
||
|
||
def drop_missing_values(
    df: pd.DataFrame,
    method: Literal['row', 'column', 'both'] = 'row',
    threshold: Optional[float] = None,
    subset: Optional[List[str]] = None
) -> pd.DataFrame:
    """
    Drop missing values from a data frame and print a progress report.

    The input frame is never modified; all work is done on a copy.

    Args:
        df: Input data frame.
        method: Deletion strategy.
            - 'row': drop rows that contain missing values.
            - 'column': drop columns whose missing rate exceeds ``threshold``.
            - 'both': drop columns first, then rows.
            Any other value leaves the data unchanged (only the report
            is printed).
        threshold: Missing-rate cut-off used by 'column' and 'both'. A column
            is dropped when its missing rate is strictly greater than this
            value. Defaults to 0.5 (50%) when None. The value is not
            range-checked.
        subset: Columns to inspect when dropping rows (used by 'row' and
            'both'). None or an empty list means every column is inspected.
            In 'both' mode, subset columns that were already removed by the
            column pass are skipped instead of raising ``KeyError``; if no
            subset column survives, no rows are dropped.

    Returns:
        A new data frame with the selected rows/columns removed.

    Examples:
        # Drop rows containing any missing value
        drop_missing_values(df, method='row')

        # Drop columns whose missing rate is > 30%
        drop_missing_values(df, method='column', threshold=0.3)

        # Drop sparse columns first, then incomplete rows
        drop_missing_values(df, method='both', threshold=0.5)

        # Only inspect the given columns
        drop_missing_values(df, method='row', subset=['年龄', 'BMI'])
    """
    result = df.copy()
    original_shape = result.shape

    print(f'原始数据: {original_shape[0]} 行 × {original_shape[1]} 列')
    print(f'缺失值总数: {result.isna().sum().sum()}')
    print('')

    # Default threshold
    if threshold is None:
        threshold = 0.5

    # Columns removed by the column pass. The row pass needs this so it does
    # not ask pandas to inspect labels that no longer exist.
    cols_to_drop = []

    # Column-wise deletion
    if method in ('column', 'both'):
        # Missing rate per column (NaN for a zero-row frame, which never
        # compares greater than the threshold, so nothing is dropped).
        missing_rate = result.isna().sum() / len(result)
        cols_to_drop = missing_rate[missing_rate > threshold].index.tolist()

        if cols_to_drop:
            print(f'检测到缺失率>{threshold*100:.0f}%的列: {len(cols_to_drop)}个')
            for col in cols_to_drop:
                rate = missing_rate[col]
                count = result[col].isna().sum()
                print(f'  - {col}: 缺失率={rate*100:.1f}% ({count}/{len(result)})')

            result = result.drop(columns=cols_to_drop)
            print(f'删除后: {result.shape[0]} 行 × {result.shape[1]} 列')
            print('')
        else:
            print(f'没有找到缺失率>{threshold*100:.0f}%的列')
            print('')

    # Row-wise deletion
    if method in ('row', 'both'):
        before_rows = len(result)

        if subset:
            row_subset = subset
            if cols_to_drop:
                # In 'both' mode a requested column may already be gone;
                # passing it to dropna() would raise KeyError.
                removed = set(cols_to_drop)
                skipped = [col for col in subset if col in removed]
                if skipped:
                    print(f'指定列已在按列删除时被移除,跳过: {skipped}')
                    row_subset = [col for col in subset if col not in removed]

            # Only inspect the requested columns
            print(f'仅检查指定列的缺失值: {row_subset}')
            if row_subset:
                result = result.dropna(subset=row_subset)
            # else: every requested column was dropped, so there is nothing
            # left to check and no row is removed.
        else:
            # Inspect every column
            result = result.dropna()

        dropped_rows = before_rows - len(result)
        if dropped_rows > 0:
            # before_rows > 0 is guaranteed here, so the division is safe.
            print(f'删除了 {dropped_rows} 行(包含缺失值的行)')
            print(f'保留了 {len(result)} 行({len(result)/before_rows*100:.1f}%)')
        else:
            print('没有找到包含缺失值的行')
        print('')

    # Final statistics
    final_shape = result.shape
    print(f'最终结果: {final_shape[0]} 行 × {final_shape[1]} 列')
    print(f'删除了 {original_shape[0] - final_shape[0]} 行')
    print(f'删除了 {original_shape[1] - final_shape[1]} 列')
    print(f'剩余缺失值: {result.isna().sum().sum()}')

    # Warn when nothing is left
    if len(result) == 0:
        print('\n⚠️ 警告: 删除后数据为空!')

    return result
|
||
|
||
|
||
def get_missing_summary(df: pd.DataFrame) -> dict:
    """
    Build a summary of the missing values in a data frame.

    Args:
        df: Input data frame.

    Returns:
        A dict with the following keys:
            - 'total_cells': number of rows times number of columns.
            - 'total_missing': total count of missing cells.
            - 'missing_rate': total_missing / total_cells, or 0 when the
              frame has no cells.
            - 'rows_with_missing': number of rows with at least one
              missing cell.
            - 'cols_with_missing': number of columns with at least one
              missing cell.
            - 'col_missing_detail': for each such column, a dict holding
              its missing 'count' and its missing 'rate' (count divided
              by the number of rows).
    """
    na_mask = df.isna()
    n_rows, n_cols = df.shape
    cell_count = n_rows * n_cols

    # Per-column counts and rates
    per_column = na_mask.sum()
    missing_total = per_column.sum()
    per_column_rate = per_column / len(df)

    has_missing = per_column > 0
    count_by_col = per_column[has_missing].to_dict()
    rate_by_col = per_column_rate[has_missing].to_dict()

    # Per-row: a row counts as soon as any of its cells is missing
    affected_rows = na_mask.any(axis=1).sum()

    if cell_count > 0:
        overall_rate = missing_total / cell_count
    else:
        overall_rate = 0

    detail = {}
    for name, n_missing in count_by_col.items():
        detail[name] = {
            'count': int(n_missing),
            'rate': float(rate_by_col[name]),
        }

    return {
        'total_cells': cell_count,
        'total_missing': int(missing_total),
        'missing_rate': overall_rate,
        'rows_with_missing': int(affected_rows),
        'cols_with_missing': len(count_by_col),
        'col_missing_detail': detail,
    }
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|