fix(dc/tool-c): handle special characters correctly and improve UX

Major fixes:
- Fix pivot transformation with special characters in column names
- Fix compute column validation for Chinese punctuation
- Fix recode dialog to fetch unique values from full dataset via new API
- Add column mapping mechanism to handle special characters

Database migration:
- Add column_mapping field to dc_tool_c_sessions table
- Migration file: 20251208_add_column_mapping

UX improvements:
- Darken table grid lines for better visibility
- Reduce column width by 40% with tooltip support
- Insert new columns next to source columns
- Preserve original row order after operations
- Add notice about 50-row preview limit

Modified files:
- Backend: SessionService, SessionController, QuickActionService, routes
- Python: pivot.py, compute.py, recode.py, binning.py, conditional.py
- Frontend: DataGrid, RecodeDialog, index.tsx, ag-grid-custom.css
- Database: schema.prisma, migration SQL

Status: Code complete, database migrated, ready for testing
This commit is contained in:
2025-12-08 23:20:55 +08:00
parent f729699510
commit 91cab452d1
90 changed files with 735 additions and 45 deletions

View File

@@ -14,3 +14,4 @@
__version__ = '1.0.0'

View File

@@ -136,6 +136,15 @@ def apply_binning(
else:
raise ValueError(f"不支持的分箱方法: {method}")
# ✨ 优化:将新列移到原列旁边
original_col_index = result.columns.get_loc(column)
cols = list(result.columns)
# 移除新列(当前在最后)
cols.remove(new_column_name)
# 插入到原列旁边
cols.insert(original_col_index + 1, new_column_name)
result = result[cols]
# 统计分布
print(f'分箱结果分布:')
value_counts = result[new_column_name].value_counts().sort_index()

View File

@@ -59,10 +59,13 @@ def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
if re.search(pattern, formula, re.IGNORECASE):
return False, f'公式包含不允许的操作: {pattern}'
# 检查是否只包含允许的字符
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\.,\*\*]'
# ✨ 增强:检查是否只包含允许的字符(放宽限制,支持更多特殊字符)
# 允许:英文字母、数字、下划线、中文、空格、运算符、括号(中英文)、逗号、点、冒号、等号
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\[\]\{\}\.,:\*\*=()【】、。:;!?]'
if not re.match(f'^{allowed_chars}+$', formula):
return False, '公式包含不允许的字符'
# 找出不允许的字符
invalid_chars = set(re.findall(f'[^{allowed_chars}]', formula))
return False, f'公式包含不允许的字符: {", ".join(invalid_chars)}'
return True, ''
@@ -110,21 +113,41 @@ def compute_column(
# 准备执行环境
# 1. 添加数据框的列作为变量(自动转换数值类型)
env = {}
for col in result.columns:
# ✨ 增强:处理列名中的特殊字符
# 创建列名映射:将公式中的列名替换为安全的变量名
col_mapping = {}
formula_safe = formula
for i, col in enumerate(result.columns):
# 为每个列创建一个安全的变量名
safe_var = f'col_{i}'
col_mapping[col] = safe_var
# 在公式中替换列名(完整匹配,避免部分替换)
# 使用正则表达式确保只替换完整的列名
import re
# 转义列名中的特殊字符
col_escaped = re.escape(col)
# 替换公式中的列名(前后必须是边界)
formula_safe = re.sub(rf'\b{col_escaped}\b', safe_var, formula_safe)
# 尝试将列转换为数值类型
try:
# 如果列可以转换为数值,就转换
numeric_col = pd.to_numeric(result[col], errors='coerce')
# 如果转换后不全是NaN说明是数值列
if not numeric_col.isna().all():
env[col] = numeric_col
print(f'"{col}" 自动转换为数值类型')
env[safe_var] = numeric_col
print(f'"{col}" -> {safe_var} (数值类型)')
else:
# 否则保持原样
env[col] = result[col]
env[safe_var] = result[col]
print(f'"{col}" -> {safe_var}')
except Exception:
# 转换失败,保持原样
env[col] = result[col]
env[safe_var] = result[col]
print(f'"{col}" -> {safe_var}')
# 2. 添加允许的函数
env.update(ALLOWED_FUNCTIONS)
@@ -132,11 +155,30 @@ def compute_column(
# 3. 添加numpy用于数学运算
env['np'] = np
print(f' 使用安全公式: {formula_safe}')
print('')
try:
# 执行公式计算
result[new_column_name] = eval(formula, {"__builtins__": {}}, env)
# ✨ 使用转换后的安全公式执行计算
computed_values = eval(formula_safe, {"__builtins__": {}}, env)
print(f'计算成功!')
# ✨ 优化:将新列插入到第一个引用列的旁边
# 找到公式中引用的第一个列
first_ref_col = None
for col in result.columns:
safe_var = col_mapping.get(col)
if safe_var and safe_var in formula_safe:
first_ref_col = col
break
if first_ref_col:
ref_col_index = result.columns.get_loc(first_ref_col)
result.insert(ref_col_index + 1, new_column_name, computed_values)
print(f'计算成功!新列插入在 {first_ref_col} 旁边')
else:
# 如果找不到引用列,添加到最后
result[new_column_name] = computed_values
print(f'计算成功!')
print(f'新列类型: {result[new_column_name].dtype}')
print(f'新列前5个值:')
# 安全打印避免NaN/inf导致序列化错误

View File

@@ -128,6 +128,16 @@ def apply_conditional_column(
print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')
# ✨ 优化:将新列移到第一个引用列旁边
first_ref_col = rules[0]['conditions'][0]['column'] # 使用第一个规则的第一个条件列作为参考
original_col_index = result.columns.get_loc(first_ref_col)
cols = list(result.columns)
# 移除新列(当前在最后)
cols.remove(new_column_name)
# 插入到原列旁边
cols.insert(original_col_index + 1, new_column_name)
result = result[cols]
# 统计结果分布
print(f'\n结果分布:')
value_counts = result[new_column_name].value_counts(dropna=False)

View File

@@ -147,3 +147,4 @@ def get_missing_summary(df: pd.DataFrame) -> dict:
}
}

View File

@@ -107,3 +107,4 @@ def apply_filter(
return result

View File

@@ -77,17 +77,39 @@ def pivot_long_to_wide(
aggfunc=aggfunc
)
# 展平多级列名
# ✨ 增强:展平多级列名(处理特殊字符)
# 如果只有一个值列,列名是单层的
if len(value_columns) == 1:
df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]
# 清理列名中的特殊字符,使用安全的分隔符
value_col_clean = str(value_columns[0]).replace('(', '').replace(')', '').replace('=', '').strip()
df_pivot.columns = [f'{value_col_clean}___{str(col).replace(" ", "_")}' for col in df_pivot.columns]
else:
# 多个值列,列名是多层的,需要展平
df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
# 使用三个下划线作为分隔符(避免与列名中的下划线冲突)
new_columns = []
for col in df_pivot.columns.values:
if isinstance(col, tuple):
# 清理每个部分的特殊字符
parts = [str(c).replace('(', '').replace(')', '').replace('=', '').strip() for c in col]
new_col = '___'.join(parts)
else:
new_col = str(col).replace('(', '').replace(')', '').replace('=', '').strip()
new_columns.append(new_col)
df_pivot.columns = new_columns
# 重置索引将index列变回普通列
df_pivot = df_pivot.reset_index()
# ✨ 优化保持原始行顺序按照index_column排序
# 获取原始数据中index_column的顺序
original_order = result[index_column].drop_duplicates().tolist()
# 创建排序映射
order_map = {val: idx for idx, val in enumerate(original_order)}
# 添加临时排序列
df_pivot['_sort_order'] = df_pivot[index_column].map(order_map)
# 按原始顺序排序
df_pivot = df_pivot.sort_values('_sort_order').drop(columns=['_sort_order']).reset_index(drop=True)
print(f'转换成功!')
print(f'结果: {len(df_pivot)}× {len(df_pivot.columns)}')
print(f'新增列: {len(df_pivot.columns) - 1}')
@@ -159,3 +181,4 @@ def get_pivot_preview(
'estimated_columns': len(unique_pivot)
}

View File

@@ -54,8 +54,13 @@ def apply_recode(
# 创建结果数据框(避免修改原数据)
result = df.copy()
# 应用映射
result[target_column] = result[column].map(mapping)
# ✨ 优化:如果是创建新列,插入到原列旁边
if create_new_column:
original_col_index = result.columns.get_loc(column)
result.insert(original_col_index + 1, target_column, result[column].map(mapping))
else:
# 覆盖原列
result[target_column] = result[column].map(mapping)
# 统计结果
mapped_count = result[target_column].notna().sum()
@@ -77,3 +82,4 @@ def apply_recode(
return result

View File

@@ -281,3 +281,4 @@ if __name__ == "__main__":

View File

@@ -47,3 +47,4 @@ except Exception as e:

View File

@@ -27,3 +27,4 @@ except Exception as e: