feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions
Summary:
- Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot)
- Refactor to pre-written Python functions architecture (stable and secure)
- Add 7 Python operations modules with full type hints
- Add 7 frontend Dialog components with user-friendly UI
- Fix NaN serialization issues and auto type conversion
- Update all related documentation

Technical Details:
- Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py)
- Backend: QuickActionService.ts with 7 execute methods
- Frontend: 7 Dialog components with complete validation
- Toolbar: Enable 7 quick action buttons

Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
16
python-microservice/operations/__init__.py
Normal file
16
python-microservice/operations/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
数据操作函数模块
|
||||
|
||||
提供预写的、经过测试的数据处理函数,供功能按钮调用。
|
||||
|
||||
模块列表:
|
||||
- filter: 高级筛选
|
||||
- recode: 数值映射(重编码)
|
||||
- binning: 生成分类变量(分箱)
|
||||
- conditional: 条件生成列
|
||||
- missing: 缺失值处理
|
||||
- duplicate: 去重
|
||||
"""
|
||||
|
||||
__version__ = '1.0.0'
|
||||
|
||||
123
python-microservice/operations/binning.py
Normal file
123
python-microservice/operations/binning.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
生成分类变量(分箱)操作
|
||||
|
||||
将连续数值变量转换为分类变量。
|
||||
支持三种方法:自定义切点、等宽分箱、等频分箱。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Optional, Literal, Union
|
||||
|
||||
|
||||
def apply_binning(
    df: pd.DataFrame,
    column: str,
    method: Literal['custom', 'equal_width', 'equal_freq'],
    new_column_name: str,
    bins: Optional[List[Union[int, float]]] = None,
    labels: Optional[List[Union[str, int]]] = None,
    num_bins: int = 3
) -> pd.DataFrame:
    """
    Bin a continuous numeric column into a new categorical column.

    Args:
        df: Input data frame.
        column: Name of the numeric column to bin.
        method: Binning method:
            - 'custom': user-supplied cut points (open-ended on both sides)
            - 'equal_width': equal-width bins
            - 'equal_freq': equal-frequency (quantile) bins
        new_column_name: Name of the generated categorical column.
        bins: Cut points for method='custom'; n cut points produce n+1
            groups, e.g. [18, 60] -> <18, 18-60, >=60.
        labels: Optional category labels. For 'custom' the count must be
            len(bins) + 1; for the other methods it must match num_bins.
        num_bins: Number of groups for 'equal_width' / 'equal_freq'.

    Returns:
        A copy of ``df`` with the new categorical column appended.

    Raises:
        KeyError: If ``column`` does not exist.
        TypeError: If ``column`` is not numeric.
        ValueError: For invalid bins/labels/num_bins or an unknown method.

    Examples:
        >>> df = pd.DataFrame({'年龄': [15, 25, 35, 45, 55, 65, 75]})
        >>> result = apply_binning(df, '年龄', 'custom', '年龄分组',
        ...                        bins=[18, 60], labels=['青少年', '成年', '老年'])
        >>> result['年龄分组'].tolist()
        ['青少年', '成年', '成年', '成年', '成年', '老年', '老年']
    """
    if df.empty:
        return df

    # Validate that the source column exists.
    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    # Binning only makes sense on numeric data.
    if not pd.api.types.is_numeric_dtype(df[column]):
        raise TypeError(f"列 '{column}' 不是数值类型,无法进行分箱")

    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()

    if method == 'custom':
        # User-supplied cut points.
        if not bins or len(bins) < 2:
            raise ValueError('自定义切点至少需要2个值')

        # Cut points must be strictly ascending for pd.cut.
        if bins != sorted(bins):
            raise ValueError('切点必须按升序排列')

        # BUG FIX: the documented contract ([18, 60] -> <18, 18-60, >=60)
        # requires open-ended outer intervals. Pad the cut points with
        # +/-inf so that n cut points yield n+1 groups; previously the raw
        # list was passed to pd.cut (n-1 groups) and the doctest example
        # raised ValueError on its 3 labels.
        edges = [-np.inf] + list(bins) + [np.inf]

        # With the padding, label count must be len(bins) + 1.
        if labels and len(labels) != len(edges) - 1:
            raise ValueError(f'标签数量({len(labels)})必须等于切点数量+1({len(edges) - 1})')

        result[new_column_name] = pd.cut(
            result[column],
            bins=edges,
            labels=labels,
            right=False,          # intervals closed on the left: [a, b)
            include_lowest=True
        )

    elif method == 'equal_width':
        # Equal-width binning over the observed range.
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        result[new_column_name] = pd.cut(
            result[column],
            bins=num_bins,
            labels=labels,
            include_lowest=True
        )

    elif method == 'equal_freq':
        # Equal-frequency (quantile) binning.
        if num_bins < 2:
            raise ValueError('分组数量至少为2')

        result[new_column_name] = pd.qcut(
            result[column],
            q=num_bins,
            labels=labels,
            duplicates='drop'  # tolerate repeated quantile boundaries
        )

    else:
        raise ValueError(f"不支持的分箱方法: {method}")

    # Console feedback: distribution of the resulting categories.
    print(f'分箱结果分布:')
    value_counts = result[new_column_name].value_counts().sort_index()
    for category, count in value_counts.items():
        percentage = count / len(result) * 100
        print(f' {category}: {count} 行 ({percentage:.1f}%)')

    # Warn about values that could not be assigned to any bin.
    missing_count = result[new_column_name].isna().sum()
    if missing_count > 0:
        print(f'警告: {missing_count} 个值无法分箱(可能是缺失值或边界问题)')

    return result
|
||||
|
||||
109
python-microservice/operations/filter.py
Normal file
109
python-microservice/operations/filter.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
高级筛选操作
|
||||
|
||||
提供多条件筛选功能,支持AND/OR逻辑组合。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Literal
|
||||
|
||||
|
||||
def apply_filter(
    df: pd.DataFrame,
    conditions: List[Dict[str, Any]],
    logic: Literal['and', 'or'] = 'and'
) -> pd.DataFrame:
    """
    Filter a data frame by a list of conditions combined with AND/OR.

    Args:
        df: Input data frame.
        conditions: Condition dicts, each with:
            - column: column name
            - operator: one of =, !=, >, <, >=, <=, contains, not_contains,
              starts_with, ends_with, is_null, not_null
            - value: comparison value (ignored by is_null / not_null)
        logic: How to combine the conditions ('and' or 'or').

    Returns:
        The filtered data frame (a copy).

    Raises:
        ValueError: If ``conditions`` is empty, or an operator/logic value
            is not supported.
        KeyError: If a referenced column does not exist.

    Examples:
        >>> df = pd.DataFrame({'年龄': [25, 35, 45], '性别': ['男', '女', '男']})
        >>> conditions = [
        ...     {'column': '年龄', 'operator': '>', 'value': 30},
        ...     {'column': '性别', 'operator': '=', 'value': '男'}
        ... ]
        >>> result = apply_filter(df, conditions, logic='and')
        >>> len(result)
        1
    """
    if not conditions:
        raise ValueError('筛选条件不能为空')

    if df.empty:
        return df

    def _mask_for(series: pd.Series, op: str, value: Any) -> pd.Series:
        """Build the boolean mask for one condition (raises on unknown op)."""
        dispatch = {
            '=': lambda s: s == value,
            '!=': lambda s: s != value,
            '>': lambda s: s > value,
            '<': lambda s: s < value,
            '>=': lambda s: s >= value,
            '<=': lambda s: s <= value,
            'contains': lambda s: s.astype(str).str.contains(str(value), na=False),
            'not_contains': lambda s: ~s.astype(str).str.contains(str(value), na=False),
            'starts_with': lambda s: s.astype(str).str.startswith(str(value), na=False),
            'ends_with': lambda s: s.astype(str).str.endswith(str(value), na=False),
            'is_null': lambda s: s.isna(),
            'not_null': lambda s: s.notna(),
        }
        if op not in dispatch:
            raise ValueError(f"不支持的运算符: {op}")
        return dispatch[op](series)

    # One boolean mask per condition, validating columns as we go.
    masks = []
    for spec in conditions:
        col = spec['column']
        if col not in df.columns:
            raise KeyError(f"列 '{col}' 不存在")
        masks.append(_mask_for(df[col], spec['operator'], spec.get('value')))

    # Combine the per-condition masks.
    if logic == 'and':
        combined = pd.concat(masks, axis=1).all(axis=1)
    elif logic == 'or':
        combined = pd.concat(masks, axis=1).any(axis=1)
    else:
        raise ValueError(f"不支持的逻辑运算: {logic}")

    result = df[combined].copy()

    # Console feedback: how many rows survived the filter.
    total = len(df)
    kept = len(result)
    dropped = total - kept

    print(f'原始数据: {total} 行')
    print(f'筛选后: {kept} 行')
    print(f'删除: {dropped} 行 ({dropped/total*100:.1f}%)')

    return result
|
||||
|
||||
79
python-microservice/operations/recode.py
Normal file
79
python-microservice/operations/recode.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""
|
||||
数值映射(重编码)操作
|
||||
|
||||
将分类变量的原始值映射为新值(如:男→1,女→2)。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
def apply_recode(
    df: pd.DataFrame,
    column: str,
    mapping: Dict[Any, Any],
    create_new_column: bool = True,
    new_column_name: Optional[str] = None
) -> pd.DataFrame:
    """
    Recode a column's values through a mapping dict (e.g. 男→1, 女→2).

    Values absent from ``mapping`` become NaN in the target column.

    Args:
        df: Input data frame.
        column: Name of the column to recode.
        mapping: Value mapping, e.g. {'男': 1, '女': 2}.
        create_new_column: Write into a new column (True) or overwrite the
            original column in place (False).
        new_column_name: Name of the new column (used when
            create_new_column=True; defaults to '<column>_编码').

    Returns:
        A copy of ``df`` with the recoded column.

    Raises:
        KeyError: If ``column`` does not exist.
        ValueError: If ``mapping`` is empty.

    Examples:
        >>> df = pd.DataFrame({'性别': ['男', '女', '男', '女']})
        >>> mapping = {'男': 1, '女': 2}
        >>> result = apply_recode(df, '性别', mapping, True, '性别_编码')
        >>> result['性别_编码'].tolist()
        [1, 2, 1, 2]
    """
    if df.empty:
        return df

    # Validate that the source column exists.
    if column not in df.columns:
        raise KeyError(f"列 '{column}' 不存在")

    if not mapping:
        raise ValueError('映射字典不能为空')

    # Resolve the target column name.
    if create_new_column:
        target_column = new_column_name or f'{column}_编码'
    else:
        target_column = column

    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()

    # Keep a handle on the pre-recode values: when overwriting in place the
    # source column is about to be destroyed, and the unmapped-value report
    # below needs the originals.
    original_values = df[column]

    # Apply the mapping (unmatched values become NaN).
    result[target_column] = original_values.map(mapping)

    # Summary statistics for console feedback.
    mapped_count = result[target_column].notna().sum()
    unmapped_count = result[target_column].isna().sum()
    total_count = len(result)

    print(f'映射完成: {mapped_count} 个值成功映射')

    if unmapped_count > 0:
        print(f'警告: {unmapped_count} 个值未找到对应映射')
        # BUG FIX: report from the saved original values — previously this
        # read result[column], which (with create_new_column=False) had just
        # been overwritten, so the report showed [nan, ...] instead of the
        # actual unmapped values.
        unmapped_mask = result[target_column].isna()
        unmapped_values = original_values[unmapped_mask].unique()
        print(f'未映射的值: {list(unmapped_values)[:10]}')  # show at most 10

    # Overall success rate (guard against an empty frame just in case).
    success_rate = (mapped_count / total_count * 100) if total_count > 0 else 0
    print(f'映射成功率: {success_rate:.1f}%')

    return result
|
||||
|
||||
Reference in New Issue
Block a user