feat(dc/tool-c): Add pivot column ordering and NA handling features

Major features:
1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)

2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fixes NA cells being displayed as the string "nan")
   - Conditional: Add is_null/not_null operators

3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
Python: pivot.py, recode.py, binning.py, conditional.py, main.py
Backend: SessionController, QuickActionController, QuickActionService
Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
This commit is contained in:
2025-12-09 14:40:14 +08:00
parent 75ceeb0653
commit f4f1d09837
19 changed files with 2314 additions and 123 deletions

View File

@@ -17,7 +17,10 @@ def apply_binning(
new_column_name: str,
bins: Optional[List[Union[int, float]]] = None,
labels: Optional[List[Union[str, int]]] = None,
num_bins: int = 3
num_bins: int = 3,
na_handling: Literal['keep', 'label', 'assign'] = 'keep',
na_label: Optional[str] = None,
na_assign_to: Optional[int] = None
) -> pd.DataFrame:
"""
应用分箱操作
@@ -33,16 +36,23 @@ def apply_binning(
bins: 自定义切点列表(仅method='custom'时使用),如 [18, 60] → <18, 18-60, >60
labels: 标签列表(可选)
num_bins: 分组数量(仅method='equal_width'或'equal_freq'时使用)
na_handling: NA值处理方式
- 'keep': 保持为NA(默认)
- 'label': 标记为指定标签
- 'assign': 分配到指定组
na_label: 当na_handling='label'时,NA使用的标签(如"缺失")
na_assign_to: 当na_handling='assign'时,NA分配到的组索引(labels中的下标,从0开始)
Returns:
分箱后的数据框
Examples:
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75, None]})
>>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
... bins=[18, 60], labels=['青少年', '成年', '老年'])
... bins=[18, 60], labels=['青少年', '成年', '老年'],
... na_handling='label', na_label='缺失')
>>> result['年龄分组'].tolist()
['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
['青少年', '成年', '成年', '成年', '成年', '老年', '老年', '缺失']
"""
if df.empty:
return df
@@ -54,6 +64,10 @@ def apply_binning(
# 创建结果数据框
result = df.copy()
# ✨ 记录原始NA的位置(在分箱前)
original_na_mask = result[column].isna()
original_na_count = original_na_mask.sum()
# 验证并转换数据类型
if not pd.api.types.is_numeric_dtype(result[column]):
# 尝试将字符串转换为数值
@@ -136,6 +150,9 @@ def apply_binning(
else:
raise ValueError(f"不支持的分箱方法: {method}")
# ✨ 重要:将Categorical类型转换为object类型,避免"nan"字符串问题
result[new_column_name] = result[new_column_name].astype('object')
# ✨ 优化:将新列移到原列旁边
original_col_index = result.columns.get_loc(column)
cols = list(result.columns)
@@ -145,6 +162,27 @@ def apply_binning(
cols.insert(original_col_index + 1, new_column_name)
result = result[cols]
# ✨ 处理NA值(使用分箱前记录的NA位置)
if original_na_count > 0:
if na_handling == 'keep':
# 保持为NA(显式设置为None,避免显示为"nan"字符串)
result.loc[original_na_mask, new_column_name] = None
print(f'📊 NA处理保持为NA{original_na_count}个)', flush=True)
elif na_handling == 'label':
# 标记为指定标签
label_to_use = na_label if na_label else '空值/NA'
result.loc[original_na_mask, new_column_name] = label_to_use
print(f'📊 NA处理标记为 "{label_to_use}"{original_na_count}个)', flush=True)
elif na_handling == 'assign':
# 分配到指定组通过labels
if labels and na_assign_to is not None and 0 <= na_assign_to < len(labels):
result.loc[original_na_mask, new_column_name] = labels[na_assign_to]
print(f'📊 NA处理分配到组 "{labels[na_assign_to]}"{original_na_count}个)', flush=True)
else:
print(f'⚠️ 警告na_assign_to无效NA保持为空', flush=True)
# 统计分布
print(f'分箱结果分布:')
value_counts = result[new_column_name].value_counts().sort_index()