fix(dc/tool-c): Fix special character handling and improve UX

Major fixes:
- Fix pivot transformation with special characters in column names
- Fix compute column validation for Chinese punctuation
- Fix recode dialog to fetch unique values from the full dataset via a new API
- Add column mapping mechanism to handle special characters

Database migration:
- Add column_mapping field to dc_tool_c_sessions table
- Migration file: 20251208_add_column_mapping

UX improvements:
- Darken table grid lines for better visibility
- Reduce column width by 40% with tooltip support
- Insert new columns next to source columns
- Preserve original row order after operations
- Add notice about 50-row preview limit

Modified files:
- Backend: SessionService, SessionController, QuickActionService, routes
- Python: pivot.py, compute.py, recode.py, binning.py, conditional.py
- Frontend: DataGrid, RecodeDialog, index.tsx, ag-grid-custom.css
- Database: schema.prisma, migration SQL

Status: Code complete, database migrated, ready for testing
2025-12-08 23:20:55 +08:00
parent f729699510
commit 91cab452d1
90 changed files with 735 additions and 45 deletions
--- a/extraction_service/operations/compute.py
+++ b/extraction_service/operations/compute.py
@@ -59,10 +59,13 @@ def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
        if re.search(pattern, formula, re.IGNORECASE):
            return False, f'公式包含不允许的操作: {pattern}'
    
-    # 检查是否只包含允许的字符
-    allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\.,\*\*]'
+    # ✨ 增强：检查是否只包含允许的字符（放宽限制，支持更多特殊字符）
+    # 允许：英文字母、数字、下划线、中文、空格、运算符、括号（中英文）、逗号、点、冒号、等号
+    allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\[\]\{\}\.,:\*\*=（）【】、。：；！？]'
    if not re.match(f'^{allowed_chars}+$', formula):
-        return False, '公式包含不允许的字符'
+        # 找出不允许的字符
+        invalid_chars = set(re.findall(f'[^{allowed_chars}]', formula))
+        return False, f'公式包含不允许的字符: {", ".join(invalid_chars)}'
    
    return True, ''

@@ -110,21 +113,41 @@ def compute_column(
    # 准备执行环境
    # 1. 添加数据框的列作为变量（自动转换数值类型）
    env = {}
-    for col in result.columns:
+    
+    # ✨ 增强：处理列名中的特殊字符
+    # 创建列名映射：将公式中的列名替换为安全的变量名
+    col_mapping = {}
+    formula_safe = formula
+    
+    for i, col in enumerate(result.columns):
+        # 为每个列创建一个安全的变量名
+        safe_var = f'col_{i}'
+        col_mapping[col] = safe_var
+        
+        # 在公式中替换列名（完整匹配，避免部分替换）
+        # 使用正则表达式确保只替换完整的列名
+        import re
+        # 转义列名中的特殊字符
+        col_escaped = re.escape(col)
+        # 替换公式中的列名（前后必须是边界）
+        formula_safe = re.sub(rf'\b{col_escaped}\b', safe_var, formula_safe)
+        
        # 尝试将列转换为数值类型
        try:
            # 如果列可以转换为数值，就转换
            numeric_col = pd.to_numeric(result[col], errors='coerce')
            # 如果转换后不全是NaN，说明是数值列
            if not numeric_col.isna().all():
-                env[col] = numeric_col
-                print(f'  列 "{col}" 自动转换为数值类型')
+                env[safe_var] = numeric_col
+                print(f'  列 "{col}" -> {safe_var} (数值类型)')
            else:
                # 否则保持原样
-                env[col] = result[col]
+                env[safe_var] = result[col]
+                print(f'  列 "{col}" -> {safe_var}')
        except Exception:
            # 转换失败，保持原样
-            env[col] = result[col]
+            env[safe_var] = result[col]
+            print(f'  列 "{col}" -> {safe_var}')
    
    # 2. 添加允许的函数
    env.update(ALLOWED_FUNCTIONS)
@@ -132,11 +155,30 @@ def compute_column(
    # 3. 添加numpy（用于数学运算）
    env['np'] = np
    
+    print(f'  使用安全公式: {formula_safe}')
+    print('')
+    
    try:
-        # 执行公式计算
-        result[new_column_name] = eval(formula, {"__builtins__": {}}, env)
+        # ✨ 使用转换后的安全公式执行计算
+        computed_values = eval(formula_safe, {"__builtins__": {}}, env)
        
-        print(f'计算成功！')
+        # ✨ 优化：将新列插入到第一个引用列的旁边
+        # 找到公式中引用的第一个列
+        first_ref_col = None
+        for col in result.columns:
+            safe_var = col_mapping.get(col)
+            if safe_var and safe_var in formula_safe:
+                first_ref_col = col
+                break
+        
+        if first_ref_col:
+            ref_col_index = result.columns.get_loc(first_ref_col)
+            result.insert(ref_col_index + 1, new_column_name, computed_values)
+            print(f'计算成功！新列插入在 {first_ref_col} 旁边')
+        else:
+            # 如果找不到引用列，添加到最后
+            result[new_column_name] = computed_values
+            print(f'计算成功！')
        print(f'新列类型: {result[new_column_name].dtype}')
        print(f'新列前5个值:')
        # 安全打印（避免NaN/inf导致序列化错误）