fix(dc/tool-c): Fix special character handling and improve UX
Major fixes:
- Fix pivot transformation with special characters in column names
- Fix compute column validation for Chinese punctuation
- Fix recode dialog to fetch unique values from full dataset via new API
- Add column mapping mechanism to handle special characters

Database migration:
- Add column_mapping field to dc_tool_c_sessions table
- Migration file: 20251208_add_column_mapping

UX improvements:
- Darken table grid lines for better visibility
- Reduce column width by 40% with tooltip support
- Insert new columns next to source columns
- Preserve original row order after operations
- Add notice about 50-row preview limit

Modified files:
- Backend: SessionService, SessionController, QuickActionService, routes
- Python: pivot.py, compute.py, recode.py, binning.py, conditional.py
- Frontend: DataGrid, RecodeDialog, index.tsx, ag-grid-custom.css
- Database: schema.prisma, migration SQL

Status: Code complete, database migrated, ready for testing
This commit is contained in:
@@ -14,3 +14,4 @@
|
||||
|
||||
__version__ = '1.0.0'
|
||||
|
||||
|
||||
|
||||
@@ -136,6 +136,15 @@ def apply_binning(
|
||||
else:
|
||||
raise ValueError(f"不支持的分箱方法: {method}")
|
||||
|
||||
# ✨ 优化:将新列移到原列旁边
|
||||
original_col_index = result.columns.get_loc(column)
|
||||
cols = list(result.columns)
|
||||
# 移除新列(当前在最后)
|
||||
cols.remove(new_column_name)
|
||||
# 插入到原列旁边
|
||||
cols.insert(original_col_index + 1, new_column_name)
|
||||
result = result[cols]
|
||||
|
||||
# 统计分布
|
||||
print(f'分箱结果分布:')
|
||||
value_counts = result[new_column_name].value_counts().sort_index()
|
||||
|
||||
@@ -59,10 +59,13 @@ def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
|
||||
if re.search(pattern, formula, re.IGNORECASE):
|
||||
return False, f'公式包含不允许的操作: {pattern}'
|
||||
|
||||
# 检查是否只包含允许的字符
|
||||
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\.,\*\*]'
|
||||
# ✨ 增强:检查是否只包含允许的字符(放宽限制,支持更多特殊字符)
|
||||
# 允许:英文字母、数字、下划线、中文、空格、运算符、括号(中英文)、逗号、点、冒号、等号
|
||||
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\[\]\{\}\.,:\*\*=()【】、。:;!?]'
|
||||
if not re.match(f'^{allowed_chars}+$', formula):
|
||||
return False, '公式包含不允许的字符'
|
||||
# 找出不允许的字符
|
||||
invalid_chars = set(re.findall(f'[^{allowed_chars}]', formula))
|
||||
return False, f'公式包含不允许的字符: {", ".join(invalid_chars)}'
|
||||
|
||||
return True, ''
|
||||
|
||||
@@ -110,21 +113,41 @@ def compute_column(
|
||||
# 准备执行环境
|
||||
# 1. 添加数据框的列作为变量(自动转换数值类型)
|
||||
env = {}
|
||||
for col in result.columns:
|
||||
|
||||
# ✨ 增强:处理列名中的特殊字符
|
||||
# 创建列名映射:将公式中的列名替换为安全的变量名
|
||||
col_mapping = {}
|
||||
formula_safe = formula
|
||||
|
||||
for i, col in enumerate(result.columns):
|
||||
# 为每个列创建一个安全的变量名
|
||||
safe_var = f'col_{i}'
|
||||
col_mapping[col] = safe_var
|
||||
|
||||
# 在公式中替换列名(完整匹配,避免部分替换)
|
||||
# 使用正则表达式确保只替换完整的列名
|
||||
import re
|
||||
# 转义列名中的特殊字符
|
||||
col_escaped = re.escape(col)
|
||||
# 替换公式中的列名(前后必须是边界)
|
||||
formula_safe = re.sub(rf'\b{col_escaped}\b', safe_var, formula_safe)
|
||||
|
||||
# 尝试将列转换为数值类型
|
||||
try:
|
||||
# 如果列可以转换为数值,就转换
|
||||
numeric_col = pd.to_numeric(result[col], errors='coerce')
|
||||
# 如果转换后不全是NaN,说明是数值列
|
||||
if not numeric_col.isna().all():
|
||||
env[col] = numeric_col
|
||||
print(f' 列 "{col}" 自动转换为数值类型')
|
||||
env[safe_var] = numeric_col
|
||||
print(f' 列 "{col}" -> {safe_var} (数值类型)')
|
||||
else:
|
||||
# 否则保持原样
|
||||
env[col] = result[col]
|
||||
env[safe_var] = result[col]
|
||||
print(f' 列 "{col}" -> {safe_var}')
|
||||
except Exception:
|
||||
# 转换失败,保持原样
|
||||
env[col] = result[col]
|
||||
env[safe_var] = result[col]
|
||||
print(f' 列 "{col}" -> {safe_var}')
|
||||
|
||||
# 2. 添加允许的函数
|
||||
env.update(ALLOWED_FUNCTIONS)
|
||||
@@ -132,11 +155,30 @@ def compute_column(
|
||||
# 3. 添加numpy(用于数学运算)
|
||||
env['np'] = np
|
||||
|
||||
print(f' 使用安全公式: {formula_safe}')
|
||||
print('')
|
||||
|
||||
try:
|
||||
# 执行公式计算
|
||||
result[new_column_name] = eval(formula, {"__builtins__": {}}, env)
|
||||
# ✨ 使用转换后的安全公式执行计算
|
||||
computed_values = eval(formula_safe, {"__builtins__": {}}, env)
|
||||
|
||||
print(f'计算成功!')
|
||||
# ✨ 优化:将新列插入到第一个引用列的旁边
|
||||
# 找到公式中引用的第一个列
|
||||
first_ref_col = None
|
||||
for col in result.columns:
|
||||
safe_var = col_mapping.get(col)
|
||||
if safe_var and safe_var in formula_safe:
|
||||
first_ref_col = col
|
||||
break
|
||||
|
||||
if first_ref_col:
|
||||
ref_col_index = result.columns.get_loc(first_ref_col)
|
||||
result.insert(ref_col_index + 1, new_column_name, computed_values)
|
||||
print(f'计算成功!新列插入在 {first_ref_col} 旁边')
|
||||
else:
|
||||
# 如果找不到引用列,添加到最后
|
||||
result[new_column_name] = computed_values
|
||||
print(f'计算成功!')
|
||||
print(f'新列类型: {result[new_column_name].dtype}')
|
||||
print(f'新列前5个值:')
|
||||
# 安全打印(避免NaN/inf导致序列化错误)
|
||||
|
||||
@@ -128,6 +128,16 @@ def apply_conditional_column(
|
||||
|
||||
print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')
|
||||
|
||||
# ✨ 优化:将新列移到第一个引用列旁边
|
||||
first_ref_col = rules[0]['conditions'][0]['column'] # 使用第一个规则的第一个条件列作为参考
|
||||
original_col_index = result.columns.get_loc(first_ref_col)
|
||||
cols = list(result.columns)
|
||||
# 移除新列(当前在最后)
|
||||
cols.remove(new_column_name)
|
||||
# 插入到原列旁边
|
||||
cols.insert(original_col_index + 1, new_column_name)
|
||||
result = result[cols]
|
||||
|
||||
# 统计结果分布
|
||||
print(f'\n结果分布:')
|
||||
value_counts = result[new_column_name].value_counts(dropna=False)
|
||||
|
||||
@@ -147,3 +147,4 @@ def get_missing_summary(df: pd.DataFrame) -> dict:
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -107,3 +107,4 @@ def apply_filter(
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -77,17 +77,39 @@ def pivot_long_to_wide(
|
||||
aggfunc=aggfunc
|
||||
)
|
||||
|
||||
# 展平多级列名
|
||||
# ✨ 增强:展平多级列名(处理特殊字符)
|
||||
# 如果只有一个值列,列名是单层的
|
||||
if len(value_columns) == 1:
|
||||
df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]
|
||||
# 清理列名中的特殊字符,使用安全的分隔符
|
||||
value_col_clean = str(value_columns[0]).replace('(', '').replace(')', '').replace('=', '').strip()
|
||||
df_pivot.columns = [f'{value_col_clean}___{str(col).replace(" ", "_")}' for col in df_pivot.columns]
|
||||
else:
|
||||
# 多个值列,列名是多层的,需要展平
|
||||
df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
|
||||
# 使用三个下划线作为分隔符(避免与列名中的下划线冲突)
|
||||
new_columns = []
|
||||
for col in df_pivot.columns.values:
|
||||
if isinstance(col, tuple):
|
||||
# 清理每个部分的特殊字符
|
||||
parts = [str(c).replace('(', '').replace(')', '').replace('=', '').strip() for c in col]
|
||||
new_col = '___'.join(parts)
|
||||
else:
|
||||
new_col = str(col).replace('(', '').replace(')', '').replace('=', '').strip()
|
||||
new_columns.append(new_col)
|
||||
df_pivot.columns = new_columns
|
||||
|
||||
# 重置索引(将index列变回普通列)
|
||||
df_pivot = df_pivot.reset_index()
|
||||
|
||||
# ✨ 优化:保持原始行顺序(按照index_column排序)
|
||||
# 获取原始数据中index_column的顺序
|
||||
original_order = result[index_column].drop_duplicates().tolist()
|
||||
# 创建排序映射
|
||||
order_map = {val: idx for idx, val in enumerate(original_order)}
|
||||
# 添加临时排序列
|
||||
df_pivot['_sort_order'] = df_pivot[index_column].map(order_map)
|
||||
# 按原始顺序排序
|
||||
df_pivot = df_pivot.sort_values('_sort_order').drop(columns=['_sort_order']).reset_index(drop=True)
|
||||
|
||||
print(f'转换成功!')
|
||||
print(f'结果: {len(df_pivot)} 行 × {len(df_pivot.columns)} 列')
|
||||
print(f'新增列: {len(df_pivot.columns) - 1} 列')
|
||||
@@ -159,3 +181,4 @@ def get_pivot_preview(
|
||||
'estimated_columns': len(unique_pivot)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -54,8 +54,13 @@ def apply_recode(
|
||||
# 创建结果数据框(避免修改原数据)
|
||||
result = df.copy()
|
||||
|
||||
# 应用映射
|
||||
result[target_column] = result[column].map(mapping)
|
||||
# ✨ 优化:如果是创建新列,插入到原列旁边
|
||||
if create_new_column:
|
||||
original_col_index = result.columns.get_loc(column)
|
||||
result.insert(original_col_index + 1, target_column, result[column].map(mapping))
|
||||
else:
|
||||
# 覆盖原列
|
||||
result[target_column] = result[column].map(mapping)
|
||||
|
||||
# 统计结果
|
||||
mapped_count = result[target_column].notna().sum()
|
||||
@@ -77,3 +82,4 @@ def apply_recode(
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -281,3 +281,4 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -47,3 +47,4 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -27,3 +27,4 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user