feat(dc/tool-c): Add pivot column ordering and NA handling features
Major features:

1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)

2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fix nan display)
   - Conditional: Add is_null/not_null operators

3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
- Python: pivot.py, recode.py, binning.py, conditional.py, main.py
- Backend: SessionController, QuickActionController, QuickActionService
- Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
This commit is contained in:
@@ -17,7 +17,10 @@ def apply_binning(
|
||||
new_column_name: str,
|
||||
bins: Optional[List[Union[int, float]]] = None,
|
||||
labels: Optional[List[Union[str, int]]] = None,
|
||||
num_bins: int = 3
|
||||
num_bins: int = 3,
|
||||
na_handling: Literal['keep', 'label', 'assign'] = 'keep',
|
||||
na_label: Optional[str] = None,
|
||||
na_assign_to: Optional[int] = None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
应用分箱操作
|
||||
@@ -33,16 +36,23 @@ def apply_binning(
|
||||
bins: 自定义切点列表(仅method='custom'时使用),如 [18, 60] → <18, 18-60, >60
|
||||
labels: 标签列表(可选)
|
||||
num_bins: 分组数量(仅method='equal_width'或'equal_freq'时使用)
|
||||
na_handling: NA值处理方式
|
||||
- 'keep': 保持为NA(默认)
|
||||
- 'label': 标记为指定标签
|
||||
- 'assign': 分配到指定组
|
||||
na_label: 当na_handling='label'时,NA的标签(如"缺失")
|
||||
na_assign_to: 当na_handling='assign'时,NA分配到的组索引
|
||||
|
||||
Returns:
|
||||
分箱后的数据框
|
||||
|
||||
Examples:
|
||||
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
|
||||
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75, None]})
|
||||
>>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
|
||||
... bins=[18, 60], labels=['青少年', '成年', '老年'])
|
||||
... bins=[18, 60], labels=['青少年', '成年', '老年'],
|
||||
... na_handling='label', na_label='缺失')
|
||||
>>> result['年龄分组'].tolist()
|
||||
['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
|
||||
['青少年', '成年', '成年', '成年', '成年', '老年', '老年', '缺失']
|
||||
"""
|
||||
if df.empty:
|
||||
return df
|
||||
@@ -54,6 +64,10 @@ def apply_binning(
|
||||
# 创建结果数据框
|
||||
result = df.copy()
|
||||
|
||||
# ✨ 记录原始NA的位置(在分箱前)
|
||||
original_na_mask = result[column].isna()
|
||||
original_na_count = original_na_mask.sum()
|
||||
|
||||
# 验证并转换数据类型
|
||||
if not pd.api.types.is_numeric_dtype(result[column]):
|
||||
# 尝试将字符串转换为数值
|
||||
@@ -136,6 +150,9 @@ def apply_binning(
|
||||
else:
|
||||
raise ValueError(f"不支持的分箱方法: {method}")
|
||||
|
||||
# ✨ 重要:将Categorical类型转换为object类型,避免"nan"字符串问题
|
||||
result[new_column_name] = result[new_column_name].astype('object')
|
||||
|
||||
# ✨ 优化:将新列移到原列旁边
|
||||
original_col_index = result.columns.get_loc(column)
|
||||
cols = list(result.columns)
|
||||
@@ -145,6 +162,27 @@ def apply_binning(
|
||||
cols.insert(original_col_index + 1, new_column_name)
|
||||
result = result[cols]
|
||||
|
||||
# ✨ 处理NA值(使用分箱前记录的NA位置)
|
||||
if original_na_count > 0:
|
||||
if na_handling == 'keep':
|
||||
# 保持为NA(显式设置为None,避免显示为"nan"字符串)
|
||||
result.loc[original_na_mask, new_column_name] = None
|
||||
print(f'📊 NA处理:保持为NA({original_na_count}个)', flush=True)
|
||||
|
||||
elif na_handling == 'label':
|
||||
# 标记为指定标签
|
||||
label_to_use = na_label if na_label else '空值/NA'
|
||||
result.loc[original_na_mask, new_column_name] = label_to_use
|
||||
print(f'📊 NA处理:标记为 "{label_to_use}"({original_na_count}个)', flush=True)
|
||||
|
||||
elif na_handling == 'assign':
|
||||
# 分配到指定组(通过labels)
|
||||
if labels and na_assign_to is not None and 0 <= na_assign_to < len(labels):
|
||||
result.loc[original_na_mask, new_column_name] = labels[na_assign_to]
|
||||
print(f'📊 NA处理:分配到组 "{labels[na_assign_to]}"({original_na_count}个)', flush=True)
|
||||
else:
|
||||
print(f'⚠️ 警告:na_assign_to无效,NA保持为空', flush=True)
|
||||
|
||||
# 统计分布
|
||||
print(f'分箱结果分布:')
|
||||
value_counts = result[new_column_name].value_counts().sort_index()
|
||||
|
||||
Reference in New Issue
Block a user