fix(dc/tool-c): handle special characters correctly and improve UX

Major fixes:
- Fix pivot transformation with special characters in column names
- Fix compute column validation for Chinese punctuation
- Fix recode dialog to fetch unique values from full dataset via new API
- Add column mapping mechanism to handle special characters

Database migration:
- Add column_mapping field to dc_tool_c_sessions table
- Migration file: 20251208_add_column_mapping

UX improvements:
- Darken table grid lines for better visibility
- Reduce column width by 40% with tooltip support
- Insert new columns next to source columns
- Preserve original row order after operations
- Add notice about 50-row preview limit

Modified files:
- Backend: SessionService, SessionController, QuickActionService, routes
- Python: pivot.py, compute.py, recode.py, binning.py, conditional.py
- Frontend: DataGrid, RecodeDialog, index.tsx, ag-grid-custom.css
- Database: schema.prisma, migration SQL

Status: Code complete, database migrated, ready for testing
This commit is contained in:
2025-12-08 23:20:55 +08:00
parent f729699510
commit 91cab452d1
90 changed files with 735 additions and 45 deletions

View File

@@ -14,3 +14,4 @@
__version__ = '1.0.0'

View File

@@ -136,6 +136,15 @@ def apply_binning(
else:
raise ValueError(f"不支持的分箱方法: {method}")
# ✨ 优化:将新列移到原列旁边
original_col_index = result.columns.get_loc(column)
cols = list(result.columns)
# 移除新列(当前在最后)
cols.remove(new_column_name)
# 插入到原列旁边
cols.insert(original_col_index + 1, new_column_name)
result = result[cols]
# 统计分布
print(f'分箱结果分布:')
value_counts = result[new_column_name].value_counts().sort_index()

View File

@@ -59,10 +59,13 @@ def validate_formula(formula: str, available_columns: list) -> tuple[bool, str]:
if re.search(pattern, formula, re.IGNORECASE):
return False, f'公式包含不允许的操作: {pattern}'
# 检查是否只包含允许的字符
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\.,\*\*]'
# ✨ 增强:检查是否只包含允许的字符(放宽限制,支持更多特殊字符)
# 允许:英文字母、数字、下划线、中文、空格、运算符、括号(中英文)、逗号、点、冒号、等号
allowed_chars = r'[a-zA-Z0-9_\u4e00-\u9fa5\s\+\-\*/\(\)\[\]\{\}\.,:\*\*=()【】、。:;!?]'
if not re.match(f'^{allowed_chars}+$', formula):
return False, '公式包含不允许的字符'
# 找出不允许的字符
invalid_chars = set(re.findall(f'[^{allowed_chars}]', formula))
return False, f'公式包含不允许的字符: {", ".join(invalid_chars)}'
return True, ''
@@ -110,21 +113,41 @@ def compute_column(
# 准备执行环境
# 1. 添加数据框的列作为变量(自动转换数值类型)
env = {}
for col in result.columns:
# ✨ 增强:处理列名中的特殊字符
# 创建列名映射:将公式中的列名替换为安全的变量名
col_mapping = {}
formula_safe = formula
for i, col in enumerate(result.columns):
# 为每个列创建一个安全的变量名
safe_var = f'col_{i}'
col_mapping[col] = safe_var
# 在公式中替换列名(完整匹配,避免部分替换)
# 使用正则表达式确保只替换完整的列名
import re
# 转义列名中的特殊字符
col_escaped = re.escape(col)
# 替换公式中的列名(前后必须是边界)
formula_safe = re.sub(rf'\b{col_escaped}\b', safe_var, formula_safe)
# 尝试将列转换为数值类型
try:
# 如果列可以转换为数值,就转换
numeric_col = pd.to_numeric(result[col], errors='coerce')
# 如果转换后不全是NaN说明是数值列
if not numeric_col.isna().all():
env[col] = numeric_col
print(f'"{col}" 自动转换为数值类型')
env[safe_var] = numeric_col
print(f'"{col}" -> {safe_var} (数值类型)')
else:
# 否则保持原样
env[col] = result[col]
env[safe_var] = result[col]
print(f'"{col}" -> {safe_var}')
except Exception:
# 转换失败,保持原样
env[col] = result[col]
env[safe_var] = result[col]
print(f'"{col}" -> {safe_var}')
# 2. 添加允许的函数
env.update(ALLOWED_FUNCTIONS)
@@ -132,11 +155,30 @@ def compute_column(
# 3. 添加numpy用于数学运算
env['np'] = np
print(f' 使用安全公式: {formula_safe}')
print('')
try:
# 执行公式计算
result[new_column_name] = eval(formula, {"__builtins__": {}}, env)
# ✨ 使用转换后的安全公式执行计算
computed_values = eval(formula_safe, {"__builtins__": {}}, env)
print(f'计算成功!')
# ✨ 优化:将新列插入到第一个引用列的旁边
# 找到公式中引用的第一个列
first_ref_col = None
for col in result.columns:
safe_var = col_mapping.get(col)
if safe_var and safe_var in formula_safe:
first_ref_col = col
break
if first_ref_col:
ref_col_index = result.columns.get_loc(first_ref_col)
result.insert(ref_col_index + 1, new_column_name, computed_values)
print(f'计算成功!新列插入在 {first_ref_col} 旁边')
else:
# 如果找不到引用列,添加到最后
result[new_column_name] = computed_values
print(f'计算成功!')
print(f'新列类型: {result[new_column_name].dtype}')
print(f'新列前5个值:')
# 安全打印避免NaN/inf导致序列化错误

View File

@@ -128,6 +128,16 @@ def apply_conditional_column(
print(f' 规则{rule_idx}: 匹配 {matched_count} 行 → 值为 {result_value}')
# ✨ 优化:将新列移到第一个引用列旁边
first_ref_col = rules[0]['conditions'][0]['column'] # 使用第一个规则的第一个条件列作为参考
original_col_index = result.columns.get_loc(first_ref_col)
cols = list(result.columns)
# 移除新列(当前在最后)
cols.remove(new_column_name)
# 插入到原列旁边
cols.insert(original_col_index + 1, new_column_name)
result = result[cols]
# 统计结果分布
print(f'\n结果分布:')
value_counts = result[new_column_name].value_counts(dropna=False)

View File

@@ -147,3 +147,4 @@ def get_missing_summary(df: pd.DataFrame) -> dict:
}
}

View File

@@ -107,3 +107,4 @@ def apply_filter(
return result

View File

@@ -77,17 +77,39 @@ def pivot_long_to_wide(
aggfunc=aggfunc
)
# 展平多级列名
# ✨ 增强:展平多级列名(处理特殊字符)
# 如果只有一个值列,列名是单层的
if len(value_columns) == 1:
df_pivot.columns = [f'{value_columns[0]}_{col}' for col in df_pivot.columns]
# 清理列名中的特殊字符,使用安全的分隔符
value_col_clean = str(value_columns[0]).replace('(', '').replace(')', '').replace('=', '').strip()
df_pivot.columns = [f'{value_col_clean}___{str(col).replace(" ", "_")}' for col in df_pivot.columns]
else:
# 多个值列,列名是多层的,需要展平
df_pivot.columns = ['_'.join(str(c) for c in col).strip() for col in df_pivot.columns.values]
# 使用三个下划线作为分隔符(避免与列名中的下划线冲突)
new_columns = []
for col in df_pivot.columns.values:
if isinstance(col, tuple):
# 清理每个部分的特殊字符
parts = [str(c).replace('(', '').replace(')', '').replace('=', '').strip() for c in col]
new_col = '___'.join(parts)
else:
new_col = str(col).replace('(', '').replace(')', '').replace('=', '').strip()
new_columns.append(new_col)
df_pivot.columns = new_columns
# 重置索引将index列变回普通列
df_pivot = df_pivot.reset_index()
# ✨ 优化保持原始行顺序按照index_column排序
# 获取原始数据中index_column的顺序
original_order = result[index_column].drop_duplicates().tolist()
# 创建排序映射
order_map = {val: idx for idx, val in enumerate(original_order)}
# 添加临时排序列
df_pivot['_sort_order'] = df_pivot[index_column].map(order_map)
# 按原始顺序排序
df_pivot = df_pivot.sort_values('_sort_order').drop(columns=['_sort_order']).reset_index(drop=True)
print(f'转换成功!')
print(f'结果: {len(df_pivot)}× {len(df_pivot.columns)}')
print(f'新增列: {len(df_pivot.columns) - 1}')
@@ -159,3 +181,4 @@ def get_pivot_preview(
'estimated_columns': len(unique_pivot)
}

View File

@@ -54,8 +54,13 @@ def apply_recode(
# 创建结果数据框(避免修改原数据)
result = df.copy()
# 应用映射
result[target_column] = result[column].map(mapping)
# ✨ 优化:如果是创建新列,插入到原列旁边
if create_new_column:
original_col_index = result.columns.get_loc(column)
result.insert(original_col_index + 1, target_column, result[column].map(mapping))
else:
# 覆盖原列
result[target_column] = result[column].map(mapping)
# 统计结果
mapped_count = result[target_column].notna().sum()
@@ -77,3 +82,4 @@ def apply_recode(
return result

View File

@@ -281,3 +281,4 @@ if __name__ == "__main__":

View File

@@ -47,3 +47,4 @@ except Exception as e:

View File

@@ -27,3 +27,4 @@ except Exception as e: