feat(dc/tool-c): Add pivot column ordering and NA handling features
Major features:

1. Pivot transformation enhancements:
   - Add option to keep unselected columns with 3 aggregation methods
   - Maintain original column order after pivot (aligned with source file)
   - Preserve pivot value order (first appearance order)
2. NA handling across 4 core functions:
   - Recode: Support keep/map/drop for NA values
   - Filter: Already supports is_null/not_null operators
   - Binning: Support keep/label/assign for NA values (fix nan display)
   - Conditional: Add is_null/not_null operators
3. UI improvements:
   - Enable column header tooltips with custom header component
   - Add closeable alert for 50-row preview
   - Fix page scrollbar issues

Modified files:
- Python: pivot.py, recode.py, binning.py, conditional.py, main.py
- Backend: SessionController, QuickActionController, QuickActionService
- Frontend: PivotDialog, RecodeDialog, BinningDialog, ConditionalDialog, DataGrid, index

Status: Ready for testing
This commit is contained in:
@@ -97,6 +97,8 @@ class RecodeRequest(BaseModel):
|
||||
mapping: Dict[Any, Any]
|
||||
create_new_column: bool = True
|
||||
new_column_name: str = None
|
||||
na_handling: str = 'keep' # ✨ 新增:NA处理方式(keep/map/drop)
|
||||
na_value: Any = None # ✨ 新增:NA映射值
|
||||
|
||||
class BinningRequest(BaseModel):
|
||||
"""分箱请求模型"""
|
||||
@@ -107,6 +109,9 @@ class BinningRequest(BaseModel):
|
||||
bins: List[Any] = None
|
||||
labels: List[Any] = None
|
||||
num_bins: int = 3
|
||||
na_handling: str = 'keep' # ✨ 新增:NA处理方式(keep/label/assign)
|
||||
na_label: str = None # ✨ 新增:NA标签
|
||||
na_assign_to: int = None # ✨ 新增:NA分配到的组索引
|
||||
|
||||
class ConditionalRequest(BaseModel):
|
||||
"""条件生成列请求模型"""
|
||||
@@ -127,6 +132,7 @@ class ComputeRequest(BaseModel):
|
||||
data: List[Dict[str, Any]]
|
||||
new_column_name: str
|
||||
formula: str
|
||||
column_mapping: List[Dict[str, str]] = [] # ✨ 新增:列名映射
|
||||
|
||||
class PivotRequest(BaseModel):
|
||||
"""Pivot请求模型"""
|
||||
@@ -135,6 +141,11 @@ class PivotRequest(BaseModel):
|
||||
pivot_column: str
|
||||
value_columns: List[str]
|
||||
aggfunc: str = 'first'
|
||||
column_mapping: List[Dict[str, str]] = [] # ✨ 列名映射
|
||||
keep_unused_columns: bool = False # ✨ 是否保留未选择的列
|
||||
unused_agg_method: str = 'first' # ✨ 未选择列的聚合方式(first/mode/mean)
|
||||
original_column_order: List[str] = [] # ✨ 新增:原始列顺序
|
||||
pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序
|
||||
|
||||
|
||||
# ==================== API路由 ====================
|
||||
@@ -763,13 +774,15 @@ async def operation_recode(request: RecodeRequest):
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(request.data)
|
||||
|
||||
# 调用预写函数
|
||||
# 调用预写函数(传递NA处理参数)
|
||||
result_df = apply_recode(
|
||||
df,
|
||||
request.column,
|
||||
request.mapping,
|
||||
request.create_new_column,
|
||||
request.new_column_name
|
||||
request.new_column_name,
|
||||
request.na_handling, # ✨ NA处理方式
|
||||
request.na_value # ✨ NA映射值
|
||||
)
|
||||
|
||||
# 转换回JSON(处理NaN和inf值)
|
||||
@@ -840,7 +853,7 @@ async def operation_binning(request: BinningRequest):
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(request.data)
|
||||
|
||||
# 调用预写函数
|
||||
# 调用预写函数(传递NA处理参数)
|
||||
result_df = apply_binning(
|
||||
df,
|
||||
request.column,
|
||||
@@ -848,7 +861,10 @@ async def operation_binning(request: BinningRequest):
|
||||
request.new_column_name,
|
||||
request.bins,
|
||||
request.labels,
|
||||
request.num_bins
|
||||
request.num_bins,
|
||||
request.na_handling, # ✨ NA处理方式
|
||||
request.na_label, # ✨ NA标签
|
||||
request.na_assign_to # ✨ NA分配到的组索引
|
||||
)
|
||||
|
||||
# 转换回JSON(处理Categorical类型、NaN值和inf值)
|
||||
@@ -1106,11 +1122,12 @@ async def operation_compute(request: ComputeRequest):
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(request.data)
|
||||
|
||||
# 调用预写函数
|
||||
# ✨ 调用预写函数(传递column_mapping)
|
||||
result_df = compute_column(
|
||||
df,
|
||||
request.new_column_name,
|
||||
request.formula
|
||||
request.formula,
|
||||
request.column_mapping # ✨ 传递列名映射
|
||||
)
|
||||
|
||||
# 转换回JSON(处理NaN值和inf值)
|
||||
@@ -1201,13 +1218,18 @@ async def operation_pivot(request: PivotRequest):
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(request.data)
|
||||
|
||||
# 调用预写函数
|
||||
# ✨ 调用预写函数(传递column_mapping和未选择列处理参数)
|
||||
result_df = pivot_long_to_wide(
|
||||
df,
|
||||
request.index_column,
|
||||
request.pivot_column,
|
||||
request.value_columns,
|
||||
request.aggfunc
|
||||
request.aggfunc,
|
||||
request.column_mapping, # ✨ 传递列名映射
|
||||
request.keep_unused_columns, # ✨ 是否保留未选择的列
|
||||
request.unused_agg_method, # ✨ 未选择列的聚合方式
|
||||
request.original_column_order, # ✨ 原始列顺序
|
||||
request.pivot_value_order # ✨ 透视列值的原始顺序
|
||||
)
|
||||
|
||||
# 转换回JSON(处理NaN和inf值)
|
||||
|
||||
@@ -17,7 +17,10 @@ def apply_binning(
|
||||
new_column_name: str,
|
||||
bins: Optional[List[Union[int, float]]] = None,
|
||||
labels: Optional[List[Union[str, int]]] = None,
|
||||
num_bins: int = 3
|
||||
num_bins: int = 3,
|
||||
na_handling: Literal['keep', 'label', 'assign'] = 'keep',
|
||||
na_label: Optional[str] = None,
|
||||
na_assign_to: Optional[int] = None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
应用分箱操作
|
||||
@@ -33,16 +36,23 @@ def apply_binning(
|
||||
bins: 自定义切点列表(仅method='custom'时使用),如 [18, 60] → <18, 18-60, >60
|
||||
labels: 标签列表(可选)
|
||||
num_bins: 分组数量(仅method='equal_width'或'equal_freq'时使用)
|
||||
na_handling: NA值处理方式
|
||||
- 'keep': 保持为NA(默认)
|
||||
- 'label': 标记为指定标签
|
||||
- 'assign': 分配到指定组
|
||||
na_label: 当na_handling='label'时,NA的标签(如"缺失")
|
||||
na_assign_to: 当na_handling='assign'时,NA分配到的组索引
|
||||
|
||||
Returns:
|
||||
分箱后的数据框
|
||||
|
||||
Examples:
|
||||
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
|
||||
>>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75, None]})
|
||||
>>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
|
||||
... bins=[18, 60], labels=['青少年', '成年', '老年'])
|
||||
... bins=[18, 60], labels=['青少年', '成年', '老年'],
|
||||
... na_handling='label', na_label='缺失')
|
||||
>>> result['年龄分组'].tolist()
|
||||
['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
|
||||
['青少年', '成年', '成年', '成年', '成年', '老年', '老年', '缺失']
|
||||
"""
|
||||
if df.empty:
|
||||
return df
|
||||
@@ -54,6 +64,10 @@ def apply_binning(
|
||||
# 创建结果数据框
|
||||
result = df.copy()
|
||||
|
||||
# ✨ 记录原始NA的位置(在分箱前)
|
||||
original_na_mask = result[column].isna()
|
||||
original_na_count = original_na_mask.sum()
|
||||
|
||||
# 验证并转换数据类型
|
||||
if not pd.api.types.is_numeric_dtype(result[column]):
|
||||
# 尝试将字符串转换为数值
|
||||
@@ -136,6 +150,9 @@ def apply_binning(
|
||||
else:
|
||||
raise ValueError(f"不支持的分箱方法: {method}")
|
||||
|
||||
# ✨ 重要:将Categorical类型转换为object类型,避免"nan"字符串问题
|
||||
result[new_column_name] = result[new_column_name].astype('object')
|
||||
|
||||
# ✨ 优化:将新列移到原列旁边
|
||||
original_col_index = result.columns.get_loc(column)
|
||||
cols = list(result.columns)
|
||||
@@ -145,6 +162,27 @@ def apply_binning(
|
||||
cols.insert(original_col_index + 1, new_column_name)
|
||||
result = result[cols]
|
||||
|
||||
# ✨ 处理NA值(使用分箱前记录的NA位置)
|
||||
if original_na_count > 0:
|
||||
if na_handling == 'keep':
|
||||
# 保持为NA(显式设置为None,避免显示为"nan"字符串)
|
||||
result.loc[original_na_mask, new_column_name] = None
|
||||
print(f'📊 NA处理:保持为NA({original_na_count}个)', flush=True)
|
||||
|
||||
elif na_handling == 'label':
|
||||
# 标记为指定标签
|
||||
label_to_use = na_label if na_label else '空值/NA'
|
||||
result.loc[original_na_mask, new_column_name] = label_to_use
|
||||
print(f'📊 NA处理:标记为 "{label_to_use}"({original_na_count}个)', flush=True)
|
||||
|
||||
elif na_handling == 'assign':
|
||||
# 分配到指定组(通过labels)
|
||||
if labels and na_assign_to is not None and 0 <= na_assign_to < len(labels):
|
||||
result.loc[original_na_mask, new_column_name] = labels[na_assign_to]
|
||||
print(f'📊 NA处理:分配到组 "{labels[na_assign_to]}"({original_na_count}个)', flush=True)
|
||||
else:
|
||||
print(f'⚠️ 警告:na_assign_to无效,NA保持为空', flush=True)
|
||||
|
||||
# 统计分布
|
||||
print(f'分箱结果分布:')
|
||||
value_counts = result[new_column_name].value_counts().sort_index()
|
||||
|
||||
@@ -109,6 +109,10 @@ def apply_conditional_column(
|
||||
mask = col_data >= value
|
||||
elif operator == '<=':
|
||||
mask = col_data <= value
|
||||
elif operator == 'is_null': # ✨ 新增:为空
|
||||
mask = result[column].isna()
|
||||
elif operator == 'not_null': # ✨ 新增:不为空
|
||||
mask = result[column].notna()
|
||||
else:
|
||||
raise ValueError(f'不支持的运算符: {operator}')
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
"""
|
||||
Pivot操作 - 预写函数
|
||||
长表转宽表(一人多行 → 一人一行)
|
||||
|
||||
✨ 方案B实现:支持列名映射
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Literal, Optional
|
||||
from typing import List, Literal, Optional, Dict
|
||||
|
||||
|
||||
def pivot_long_to_wide(
|
||||
@@ -12,7 +14,12 @@ def pivot_long_to_wide(
|
||||
index_column: str,
|
||||
pivot_column: str,
|
||||
value_columns: List[str],
|
||||
aggfunc: Literal['first', 'last', 'mean', 'sum', 'min', 'max'] = 'first'
|
||||
aggfunc: Literal['first', 'last', 'mean', 'sum', 'min', 'max'] = 'first',
|
||||
column_mapping: Optional[List[Dict[str, str]]] = None,
|
||||
keep_unused_columns: bool = False,
|
||||
unused_agg_method: Literal['first', 'mode', 'mean'] = 'first',
|
||||
original_column_order: Optional[List[str]] = None,
|
||||
pivot_value_order: Optional[List[str]] = None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
长表转宽表(Pivot)
|
||||
@@ -25,33 +32,39 @@ def pivot_long_to_wide(
|
||||
pivot_column: 透视列(将变成新列名的列,如 Event Name)
|
||||
value_columns: 值列(要转置的数据列,如 FMA得分, ADL得分)
|
||||
aggfunc: 聚合函数
|
||||
- 'first': 取第一个值(推荐)
|
||||
- 'last': 取最后一个值
|
||||
- 'mean': 求平均值
|
||||
- 'sum': 求和
|
||||
- 'min': 取最小值
|
||||
- 'max': 取最大值
|
||||
column_mapping: 列名映射(可选)
|
||||
keep_unused_columns: 是否保留未选择的列(默认False)
|
||||
unused_agg_method: 未选择列的聚合方式('first'=取第一个值, 'mode'=取众数, 'mean'=取均值)
|
||||
original_column_order: 原始列顺序(用于保持列顺序一致)
|
||||
pivot_value_order: 透视列值的原始顺序(用于保持透视值顺序一致)
|
||||
|
||||
Returns:
|
||||
宽表数据框
|
||||
|
||||
示例:
|
||||
pivot_long_to_wide(
|
||||
df,
|
||||
index_column='Record ID',
|
||||
pivot_column='Event Name',
|
||||
value_columns=['FMA得分', 'ADL得分'],
|
||||
aggfunc='first'
|
||||
)
|
||||
"""
|
||||
result = df.copy()
|
||||
|
||||
print(f'原始数据: {len(result)} 行 × {len(result.columns)} 列')
|
||||
print(f'索引列: {index_column}')
|
||||
print(f'透视列: {pivot_column}')
|
||||
print(f'值列: {", ".join(value_columns)}')
|
||||
print(f'聚合方式: {aggfunc}')
|
||||
print('')
|
||||
print(f'━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━', flush=True)
|
||||
print(f'📊 Pivot转换', flush=True)
|
||||
print(f'原始数据: {len(result)} 行 × {len(result.columns)} 列', flush=True)
|
||||
print(f'索引列: {index_column}', flush=True)
|
||||
print(f'透视列: {pivot_column}', flush=True)
|
||||
print(f'值列: {", ".join(value_columns)}', flush=True)
|
||||
print(f'聚合方式: {aggfunc}', flush=True)
|
||||
|
||||
# ✨ 检测未选择的列
|
||||
all_columns = set(result.columns)
|
||||
used_columns = {index_column, pivot_column} | set(value_columns)
|
||||
unused_columns = list(all_columns - used_columns)
|
||||
|
||||
if unused_columns:
|
||||
print(f'', flush=True)
|
||||
print(f'📋 未选择的列({len(unused_columns)}个): {", ".join(unused_columns[:5])}{"..." if len(unused_columns) > 5 else ""}', flush=True)
|
||||
if keep_unused_columns:
|
||||
print(f'✓ 将保留未选择的列(聚合方式: {unused_agg_method})', flush=True)
|
||||
else:
|
||||
print(f'⚠️ 这些列将不会保留在结果中', flush=True)
|
||||
|
||||
print('', flush=True)
|
||||
|
||||
# 验证列是否存在
|
||||
required_cols = [index_column, pivot_column] + value_columns
|
||||
@@ -61,67 +74,211 @@ def pivot_long_to_wide(
|
||||
|
||||
# 检查索引列的唯一值数量
|
||||
unique_index = result[index_column].nunique()
|
||||
print(f'唯一{index_column}数量: {unique_index}')
|
||||
print(f'✓ 唯一{index_column}数量: {unique_index}', flush=True)
|
||||
|
||||
# 检查透视列的唯一值
|
||||
unique_pivot = result[pivot_column].unique()
|
||||
print(f'透视列"{pivot_column}"的唯一值: {list(unique_pivot)}')
|
||||
print('')
|
||||
# 检查透视列的唯一值(重要!)
|
||||
unique_pivot_values = result[pivot_column].unique()
|
||||
print(f'✓ 透视列"{pivot_column}"的唯一值: {list(unique_pivot_values)}', flush=True)
|
||||
print(f'✓ 唯一值数量: {len(unique_pivot_values)}', flush=True)
|
||||
|
||||
# ⚠️ 关键检查:如果唯一值只有1个,警告用户
|
||||
if len(unique_pivot_values) == 1:
|
||||
print(f'', flush=True)
|
||||
print(f'⚠️ 警告: 透视列只有1个唯一值!', flush=True)
|
||||
print(f' 这意味着Pivot后只会生成1列(而不是多列)', flush=True)
|
||||
print(f' 请检查:', flush=True)
|
||||
print(f' 1. 透视列是否选择正确?', flush=True)
|
||||
print(f' 2. 数据是否已经是宽表格式?', flush=True)
|
||||
print(f'', flush=True)
|
||||
|
||||
print('', flush=True)
|
||||
|
||||
try:
|
||||
# 执行Pivot转换
|
||||
# ✅ 执行Pivot转换(dropna=False保留全NaN的列)
|
||||
df_pivot = result.pivot_table(
|
||||
index=index_column,
|
||||
columns=pivot_column,
|
||||
values=value_columns,
|
||||
aggfunc=aggfunc
|
||||
aggfunc=aggfunc,
|
||||
dropna=False # ✨ 关键:不删除全NaN的列,确保所有组合都生成
|
||||
)
|
||||
|
||||
# ✨ 增强:展平多级列名(处理特殊字符)
|
||||
# 如果只有一个值列,列名是单层的
|
||||
print(f'✓ Pivot执行成功', flush=True)
|
||||
print(f' Pivot后shape: {df_pivot.shape}', flush=True)
|
||||
print(f' 列数: {len(df_pivot.columns)}', flush=True)
|
||||
print(f'', flush=True)
|
||||
|
||||
# ✨ 修复:更健壮的列名展平逻辑
|
||||
if len(value_columns) == 1:
|
||||
# 清理列名中的特殊字符,使用安全的分隔符
|
||||
value_col_clean = str(value_columns[0]).replace('(', '').replace(')', '').replace('=', '').strip()
|
||||
df_pivot.columns = [f'{value_col_clean}___{str(col).replace(" ", "_")}' for col in df_pivot.columns]
|
||||
else:
|
||||
# 多个值列,列名是多层的,需要展平
|
||||
# 使用三个下划线作为分隔符(避免与列名中的下划线冲突)
|
||||
# 单个值列:列名是单层的 (pivot_value1, pivot_value2, ...)
|
||||
print(f'📝 单值列模式:展平列名', flush=True)
|
||||
|
||||
# 获取原始值列名(用于生成新列名)
|
||||
value_col_name = value_columns[0]
|
||||
|
||||
# 生成新列名:值列名___透视值
|
||||
new_columns = []
|
||||
for col in df_pivot.columns.values:
|
||||
if isinstance(col, tuple):
|
||||
# 清理每个部分的特殊字符
|
||||
parts = [str(c).replace('(', '').replace(')', '').replace('=', '').strip() for c in col]
|
||||
new_col = '___'.join(parts)
|
||||
else:
|
||||
new_col = str(col).replace('(', '').replace(')', '').replace('=', '').strip()
|
||||
new_columns.append(new_col)
|
||||
for col in df_pivot.columns:
|
||||
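# NOTE(review): values= is passed to pivot_table as a list, and pandas keeps a two-level column index in that case even for a single value column, so col is probably a (value_col, pivot_value) tuple here rather than a bare pivot value such as 0, 1, 2 - confirm before relying on the generated name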
|
||||
new_col_name = f'{value_col_name}___{col}'
|
||||
new_columns.append(new_col_name)
|
||||
print(f' 生成列: {new_col_name}', flush=True)
|
||||
|
||||
df_pivot.columns = new_columns
|
||||
|
||||
else:
|
||||
# 多个值列:列名是多层的 ((value_col1, pivot_val1), (value_col1, pivot_val2), ...)
|
||||
print(f'📝 多值列模式:展平多级列名', flush=True)
|
||||
|
||||
new_columns = []
|
||||
for col in df_pivot.columns:
|
||||
if isinstance(col, tuple):
|
||||
# 元组:(值列名, 透视值)
|
||||
value_name, pivot_value = col
|
||||
new_col_name = f'{value_name}___{pivot_value}'
|
||||
new_columns.append(new_col_name)
|
||||
print(f' {col} → {new_col_name}', flush=True)
|
||||
else:
|
||||
# 单个值(不应该出现,但防御性编程)
|
||||
new_columns.append(str(col))
|
||||
|
||||
df_pivot.columns = new_columns
|
||||
|
||||
print(f'', flush=True)
|
||||
print(f'✓ 列名展平完成', flush=True)
|
||||
print(f'', flush=True)
|
||||
|
||||
# 重置索引(将index列变回普通列)
|
||||
df_pivot = df_pivot.reset_index()
|
||||
|
||||
# ✨ 优化:保持原始行顺序(按照index_column排序)
|
||||
# 获取原始数据中index_column的顺序
|
||||
# ✨ 新功能:保留未选择的列
|
||||
if keep_unused_columns and unused_columns:
|
||||
print(f'', flush=True)
|
||||
print(f'📦 正在处理未选择的列...', flush=True)
|
||||
|
||||
# 对未选择的列进行聚合
|
||||
if unused_agg_method == 'first':
|
||||
# 取第一个非空值
|
||||
unused_df = result.groupby(index_column)[unused_columns].first().reset_index()
|
||||
print(f'✓ 聚合方式:取第一个值', flush=True)
|
||||
|
||||
elif unused_agg_method == 'mode':
|
||||
# 取众数
|
||||
def get_mode(x):
|
||||
mode_vals = x.mode()
|
||||
return mode_vals[0] if len(mode_vals) > 0 else None
|
||||
|
||||
unused_df = result.groupby(index_column)[unused_columns].agg(get_mode).reset_index()
|
||||
print(f'✓ 聚合方式:取众数', flush=True)
|
||||
|
||||
elif unused_agg_method == 'mean':
|
||||
# 取均值(区分数值列和非数值列)
|
||||
numeric_cols = [col for col in unused_columns if pd.api.types.is_numeric_dtype(result[col])]
|
||||
non_numeric_cols = [col for col in unused_columns if col not in numeric_cols]
|
||||
|
||||
# 数值列取均值
|
||||
if numeric_cols:
|
||||
numeric_df = result.groupby(index_column)[numeric_cols].mean()
|
||||
else:
|
||||
numeric_df = pd.DataFrame(index=result[index_column].unique())
|
||||
|
||||
# 非数值列取第一个值
|
||||
if non_numeric_cols:
|
||||
non_numeric_df = result.groupby(index_column)[non_numeric_cols].first()
|
||||
else:
|
||||
non_numeric_df = pd.DataFrame(index=result[index_column].unique())
|
||||
|
||||
# 合并
|
||||
unused_df = pd.concat([numeric_df, non_numeric_df], axis=1).reset_index()
|
||||
print(f'✓ 聚合方式:数值列取均值,非数值列取第一个值', flush=True)
|
||||
|
||||
else:
|
||||
# 默认取第一个值
|
||||
unused_df = result.groupby(index_column)[unused_columns].first().reset_index()
|
||||
|
||||
# 合并到pivot结果中
|
||||
df_pivot = df_pivot.merge(unused_df, on=index_column, how='left')
|
||||
|
||||
print(f'✓ 已保留 {len(unused_columns)} 个未选择的列', flush=True)
|
||||
for col in unused_columns[:5]:
|
||||
print(f' • {col}', flush=True)
|
||||
if len(unused_columns) > 5:
|
||||
print(f' • ... 还有 {len(unused_columns) - 5} 列', flush=True)
|
||||
|
||||
# ✨ 优化:保持原始行顺序
|
||||
original_order = result[index_column].drop_duplicates().tolist()
|
||||
# 创建排序映射
|
||||
order_map = {val: idx for idx, val in enumerate(original_order)}
|
||||
# 添加临时排序列
|
||||
df_pivot['_sort_order'] = df_pivot[index_column].map(order_map)
|
||||
# 按原始顺序排序
|
||||
df_pivot = df_pivot.sort_values('_sort_order').drop(columns=['_sort_order']).reset_index(drop=True)
|
||||
|
||||
print(f'转换成功!')
|
||||
print(f'结果: {len(df_pivot)} 行 × {len(df_pivot.columns)} 列')
|
||||
print(f'新增列: {len(df_pivot.columns) - 1} 列')
|
||||
print('')
|
||||
# ✨ 新增:保持原始列顺序
|
||||
if original_column_order:
|
||||
print(f'', flush=True)
|
||||
print(f'🔄 按原始列顺序重排列...', flush=True)
|
||||
|
||||
# ✅ 关键:一次遍历,逐列判断(转置列展开,未选择列保持)
|
||||
final_cols = [index_column]
|
||||
|
||||
for orig_col in original_column_order:
|
||||
if orig_col == index_column or orig_col == pivot_column:
|
||||
continue # 跳过索引列和透视列
|
||||
|
||||
if orig_col in value_columns:
|
||||
# ✅ 这个列被选择转置 → 添加展开后的所有列
|
||||
related_cols = [c for c in df_pivot.columns if c.startswith(f'{orig_col}___')]
|
||||
|
||||
# ✨ 按透视列的原始顺序排序(而不是字母顺序)
|
||||
if pivot_value_order:
|
||||
# 创建顺序映射
|
||||
pivot_order_map = {val: idx for idx, val in enumerate(pivot_value_order)}
|
||||
|
||||
# 对related_cols按透视值顺序排序
|
||||
def get_pivot_value(col_name):
|
||||
# 从 "FMA___基线" 提取 "基线"
|
||||
parts = col_name.split('___')
|
||||
if len(parts) == 2:
|
||||
return parts[1]
|
||||
return col_name
|
||||
|
||||
related_cols_sorted = sorted(
|
||||
related_cols,
|
||||
key=lambda c: pivot_order_map.get(get_pivot_value(c), 999)
|
||||
)
|
||||
else:
|
||||
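# No pivot value order was supplied: fall back to sorting the expanded columns by name (this does not keep their existing order)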
|
||||
related_cols_sorted = sorted(related_cols)
|
||||
|
||||
final_cols.extend(related_cols_sorted)
|
||||
print(f' • {orig_col} → {len(related_cols_sorted)}个转置列', flush=True)
|
||||
|
||||
elif keep_unused_columns and orig_col in df_pivot.columns:
|
||||
# ✅ 这个列未被选择 → 如果保留,直接添加
|
||||
final_cols.append(orig_col)
|
||||
print(f' • {orig_col} → 保持不变', flush=True)
|
||||
|
||||
# 添加任何剩余的列(防御性编程)
|
||||
for col in df_pivot.columns:
|
||||
if col not in final_cols:
|
||||
final_cols.append(col)
|
||||
print(f' • {col} → 剩余列', flush=True)
|
||||
|
||||
# 重排列
|
||||
df_pivot = df_pivot[final_cols]
|
||||
print(f'✓ 列顺序已按原始顺序重排(总计{len(final_cols)}列)', flush=True)
|
||||
|
||||
# 显示新列名
|
||||
print(f'生成的列名:')
|
||||
print(f'✅ 转换成功!', flush=True)
|
||||
print(f'📊 结果: {len(df_pivot)} 行 × {len(df_pivot.columns)} 列', flush=True)
|
||||
print(f'📈 新增列: {len(df_pivot.columns) - 1} 列', flush=True)
|
||||
print(f'', flush=True)
|
||||
|
||||
# 显示所有新列名
|
||||
print(f'📋 生成的列名:', flush=True)
|
||||
new_cols = [col for col in df_pivot.columns if col != index_column]
|
||||
for i, col in enumerate(new_cols[:10], 1): # 只显示前10个
|
||||
print(f' {i}. {col}')
|
||||
if len(new_cols) > 10:
|
||||
print(f' ... 还有 {len(new_cols) - 10} 列')
|
||||
for i, col in enumerate(new_cols, 1):
|
||||
print(f' {i}. {col}', flush=True)
|
||||
|
||||
print(f'━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━', flush=True)
|
||||
print(f'', flush=True)
|
||||
|
||||
return df_pivot
|
||||
|
||||
@@ -180,5 +337,3 @@ def get_pivot_preview(
|
||||
'estimated_rows': int(unique_index),
|
||||
'estimated_columns': len(unique_pivot)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,8 @@
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional
|
||||
import numpy as np
|
||||
from typing import Dict, Any, Optional, Literal
|
||||
|
||||
|
||||
def apply_recode(
|
||||
@@ -13,7 +14,9 @@ def apply_recode(
|
||||
column: str,
|
||||
mapping: Dict[Any, Any],
|
||||
create_new_column: bool = True,
|
||||
new_column_name: Optional[str] = None
|
||||
new_column_name: Optional[str] = None,
|
||||
na_handling: Literal['keep', 'map', 'drop'] = 'keep',
|
||||
na_value: Any = None
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
应用数值映射
|
||||
@@ -24,16 +27,21 @@ def apply_recode(
|
||||
mapping: 映射字典,如 {'男': 1, '女': 2}
|
||||
create_new_column: 是否创建新列(True)或覆盖原列(False)
|
||||
new_column_name: 新列名(create_new_column=True时使用)
|
||||
na_handling: NA值处理方式
|
||||
- 'keep': 保持为NA(默认)
|
||||
- 'map': 映射为指定值
|
||||
- 'drop': 删除包含NA的行
|
||||
na_value: 当na_handling='map'时,NA映射到的值
|
||||
|
||||
Returns:
|
||||
重编码后的数据框
|
||||
|
||||
Examples:
|
||||
>>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
|
||||
>>> df = pd.DataFrame({'性别': ['男', '女', '男', None]})
|
||||
>>> mapping = {'男': 1, '女': 2}
|
||||
>>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
|
||||
>>> result = apply_recode(df, '性别', mapping, True, '性别_编码', na_handling='map', na_value=0)
|
||||
>>> result['性别_编码'].tolist()
|
||||
[1, 2, 1, 2]
|
||||
[1, 2, 1, 0]
|
||||
"""
|
||||
if df.empty:
|
||||
return df
|
||||
@@ -54,6 +62,9 @@ def apply_recode(
|
||||
# 创建结果数据框(避免修改原数据)
|
||||
result = df.copy()
|
||||
|
||||
# ✨ 统计原始NA数量
|
||||
original_na_count = result[column].isna().sum()
|
||||
|
||||
# ✨ 优化:如果是创建新列,插入到原列旁边
|
||||
if create_new_column:
|
||||
original_col_index = result.columns.get_loc(column)
|
||||
@@ -62,6 +73,26 @@ def apply_recode(
|
||||
# 覆盖原列
|
||||
result[target_column] = result[column].map(mapping)
|
||||
|
||||
# ✨ 处理NA值
|
||||
if original_na_count > 0:
|
||||
na_mask = result[column].isna()
|
||||
|
||||
if na_handling == 'keep':
|
||||
# 保持为NA(已经是NA,无需操作)
|
||||
print(f'📊 NA处理:保持为NA({original_na_count}个)')
|
||||
|
||||
elif na_handling == 'map':
|
||||
# 映射为指定值
|
||||
result.loc[na_mask, target_column] = na_value
|
||||
print(f'📊 NA处理:映射为 {na_value}({original_na_count}个)')
|
||||
|
||||
elif na_handling == 'drop':
|
||||
# 删除包含NA的行
|
||||
rows_before = len(result)
|
||||
result = result[~na_mask].copy()
|
||||
rows_after = len(result)
|
||||
print(f'📊 NA处理:删除包含NA的行(删除{rows_before - rows_after}行)')
|
||||
|
||||
# 统计结果
|
||||
mapped_count = result[target_column].notna().sum()
|
||||
unmapped_count = result[target_column].isna().sum()
|
||||
|
||||
Reference in New Issue
Block a user