Files
AIclinicalresearch/extraction_service/operations/binning.py
HaHafeng f4f1d09837 feat(dc/tool-c): Add pivot column ordering and NA handling features
Major features:
1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)

2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fix nan display)
   - Conditional: Add is_null/not_null operators

3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
Python: pivot.py, recode.py, binning.py, conditional.py, main.py
Backend: SessionController, QuickActionController, QuickActionService
Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
2025-12-09 14:40:14 +08:00

200 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
生成分类变量(分箱)操作
将连续数值变量转换为分类变量。
支持三种方法:自定义切点、等宽分箱、等频分箱。
"""
import pandas as pd
import numpy as np
from typing import List, Optional, Literal, Union
def apply_binning(
    df: pd.DataFrame,
    column: str,
    method: Literal['custom', 'equal_width', 'equal_freq'],
    new_column_name: str,
    bins: Optional[List[Union[int, float]]] = None,
    labels: Optional[List[Union[str, int]]] = None,
    num_bins: int = 3,
    na_handling: Literal['keep', 'label', 'assign'] = 'keep',
    na_label: Optional[str] = None,
    na_assign_to: Optional[int] = None
) -> pd.DataFrame:
    """Bin a numeric column into a new categorical (object-dtype) column.

    The input frame is copied; the new column is placed immediately to the
    right of the source column in the returned frame. Progress and the
    resulting distribution are reported with ``print``.

    Args:
        df: Input data frame. An empty frame is returned as-is (same object,
            no new column added).
        column: Name of the column to bin. A non-numeric column is coerced
            with ``pd.to_numeric(errors='coerce')`` in the returned frame.
        method: Binning strategy:
            - 'custom': user supplied cut points (``bins``).
            - 'equal_width': ``num_bins`` intervals of equal width (``pd.cut``).
            - 'equal_freq': ``num_bins`` quantile based intervals (``pd.qcut``
              with ``duplicates='drop'``).
        new_column_name: Name of the column that receives the bin of each row.
        bins: Inner cut points, strictly ascending; only used when
            ``method='custom'``. Outer edges are added automatically, so
            ``[18, 60]`` yields three left-closed groups: <18, 18-60, >=60.
        labels: Optional bin labels. For 'custom' the count must equal
            ``len(bins) + 1``. When omitted the pandas ``Interval`` objects
            are used as values.
        num_bins: Number of groups for 'equal_width' / 'equal_freq' (>= 2).
        na_handling: What to do with rows whose source value was missing
            before any numeric coercion:
            - 'keep': leave them as ``None`` (default).
            - 'label': write ``na_label`` (falls back to '空值/NA').
            - 'assign': write ``labels[na_assign_to]``; if ``labels`` is
              missing or the index is out of range a warning is printed and
              the rows stay missing.
        na_label: Label used when ``na_handling='label'``.
        na_assign_to: Index into ``labels`` used when ``na_handling='assign'``.

    Returns:
        A copy of ``df`` with ``new_column_name`` inserted next to ``column``.

    Raises:
        KeyError: ``column`` is not present in ``df``.
        TypeError: ``column`` could not be converted to a numeric type.
        ValueError: The column has no valid numbers, the cut points / label
            count / ``num_bins`` are invalid, or ``method`` is unknown.

    Example:
        With ``df = pd.DataFrame({'age': [15, 25, 35, 45, 55, 65, 75, None]})``
        the call ``apply_binning(df, 'age', 'custom', 'group', bins=[18, 60],
        labels=['young', 'adult', 'senior'], na_handling='label',
        na_label='missing')`` produces a ``group`` column equal to
        ``['young', 'adult', 'adult', 'adult', 'adult', 'senior', 'senior',
        'missing']``.
    """
    if df.empty:
        return df

    if column not in df.columns:
        raise KeyError(f"'{column}' 不存在")

    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()

    # Remember which rows were missing *before* any coercion. Rows that only
    # become NaN through the numeric coercion below are not covered by the
    # NA handling and simply stay missing in the new column.
    original_na_mask = result[column].isna()
    original_na_count = original_na_mask.sum()

    if not pd.api.types.is_numeric_dtype(result[column]):
        try:
            result[column] = pd.to_numeric(result[column], errors='coerce')
        except Exception as exc:
            # Any conversion failure is surfaced as TypeError (the type
            # callers already expect); the original cause is chained.
            raise TypeError(f"'{column}' 不是数值类型且无法转换,无法进行分箱") from exc
        print(f"警告: 列 '{column}' 已自动转换为数值类型")

    if result[column].isna().all():
        raise ValueError(f"'{column}' 中没有有效的数值,无法进行分箱")

    if method == 'custom':
        if bins is None or len(bins) < 1:
            raise ValueError('自定义切点至少需要1个值')

        # Accept any sequence (e.g. a tuple) and require strictly ascending
        # cut points; duplicates would otherwise fail inside pd.cut with an
        # unrelated "bin edges must be unique" error.
        bins = list(bins)
        if any(left >= right for left, right in zip(bins, bins[1:])):
            raise ValueError('切点必须按升序排列')

        min_val = result[column].min()
        max_val = result[column].max()

        print(f'用户输入切点: {bins}')
        print(f'数据范围: [{min_val:.2f}, {max_val:.2f}]')

        # Always add an outer edge on both sides so that the number of
        # intervals is len(bins) + 1 and every data point falls inside one.
        left_bound = min(bins[0], min_val) - 0.001
        right_bound = max(bins[-1], max_val) + 0.001
        full_bins = [left_bound] + bins + [right_bound]

        print(f'完整边界: {[f"{b:.1f}" for b in full_bins]}')
        print(f'将生成 {len(full_bins) - 1} 个区间 = {len(bins) + 1} 个区间')

        expected_label_count = len(full_bins) - 1
        if labels and len(labels) != expected_label_count:
            raise ValueError(f'标签数量({len(labels)})必须等于区间数量({expected_label_count}')

        # right=False: intervals are left-closed, so a value equal to a cut
        # point belongs to the group that starts at that cut point.
        result[new_column_name] = pd.cut(
            result[column],
            bins=full_bins,
            labels=labels,
            right=False,
            include_lowest=True
        )

    elif method == 'equal_width':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        result[new_column_name] = pd.cut(
            result[column],
            bins=num_bins,
            labels=labels,
            include_lowest=True
        )

    elif method == 'equal_freq':
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        result[new_column_name] = pd.qcut(
            result[column],
            q=num_bins,
            labels=labels,
            duplicates='drop'  # tolerate repeated quantile edges
        )

    else:
        raise ValueError(f"不支持的分箱方法: {method}")

    # Convert Categorical to object so that arbitrary NA labels can be
    # written below and missing values are not rendered as the string "nan".
    result[new_column_name] = result[new_column_name].astype('object')

    # Move the new column right next to the source column. The insertion
    # index is looked up *after* removing the new column, otherwise a
    # pre-existing column of that name located left of the source column
    # would shift the target position by one.
    cols = list(result.columns)
    previous_position = cols.index(new_column_name)
    cols.remove(new_column_name)
    if new_column_name == column:
        # The source column was overwritten in place; keep its position.
        cols.insert(previous_position, new_column_name)
    else:
        cols.insert(cols.index(column) + 1, new_column_name)
    result = result[cols]

    # Apply the NA policy to the rows recorded before binning.
    if original_na_count > 0:
        if na_handling == 'keep':
            # Explicit None avoids the value showing up as the string "nan".
            result.loc[original_na_mask, new_column_name] = None
            print(f'📊 NA处理保持为NA{original_na_count}个)', flush=True)

        elif na_handling == 'label':
            label_to_use = na_label if na_label else '空值/NA'
            result.loc[original_na_mask, new_column_name] = label_to_use
            print(f'📊 NA处理标记为 "{label_to_use}"{original_na_count}个)', flush=True)

        elif na_handling == 'assign':
            if labels and na_assign_to is not None and 0 <= na_assign_to < len(labels):
                result.loc[original_na_mask, new_column_name] = labels[na_assign_to]
                print(f'📊 NA处理分配到组 "{labels[na_assign_to]}"{original_na_count}个)', flush=True)
            else:
                print(f'⚠️ 警告na_assign_to无效NA保持为空', flush=True)

    # Report the distribution of the new column.
    print(f'分箱结果分布:')
    value_counts = result[new_column_name].value_counts()
    try:
        value_counts = value_counts.sort_index()
    except TypeError:
        # Mixed, non-orderable values (e.g. Interval or integer bins plus a
        # string NA label) cannot be sorted; the report is informational
        # only, so fall back to the frequency order instead of failing.
        pass
    for category, count in value_counts.items():
        percentage = count / len(result) * 100
        print(f' {category}: {count} 行 ({percentage:.1f}%)')

    missing_count = result[new_column_name].isna().sum()
    if missing_count > 0:
        print(f'警告: {missing_count} 个值无法分箱(可能是缺失值或边界问题)')

    return result