feat(dc): Add multi-metric transformation feature (direction 1+2)
Summary: - Implement intelligent multi-metric grouping detection algorithm - Add direction 1: timepoint-as-row, metric-as-column (analysis format) - Add direction 2: timepoint-as-column, metric-as-row (display format) - Fix column name pattern detection (FMA___ issue) - Maintain original Record ID order in output - Add full-select/clear buttons in UI - Integrate into TransformDialog with Radio selection - Update 3 documentation files Technical Details: - Python: detect_metric_groups(), apply_multi_metric_to_long(), apply_multi_metric_to_matrix() - Backend: 3 new methods in QuickActionService - Frontend: MultiMetricPanel.tsx (531 lines) - Total: ~1460 lines of new code Status: Fully tested and verified, ready for production
This commit is contained in:
@@ -70,6 +70,17 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin
|
||||
from operations.dropna import drop_missing_values, get_missing_summary
|
||||
from operations.compute import compute_column, get_formula_examples
|
||||
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||||
from operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表
|
||||
from operations.metric_time_transform import (
|
||||
apply_metric_time_transform,
|
||||
detect_common_pattern,
|
||||
preview_metric_time_transform,
|
||||
detect_metric_groups, # ✨ 多指标自动分组
|
||||
apply_multi_metric_to_long, # ✨ 多指标转长表(方向1)
|
||||
preview_multi_metric_to_long, # ✨ 多指标转换预览(方向1)
|
||||
apply_multi_metric_to_matrix, # ✨ 多指标转矩阵(方向2)
|
||||
preview_multi_metric_to_matrix # ✨ 多指标转换预览(方向2)
|
||||
)
|
||||
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
|
||||
|
||||
|
||||
@@ -149,6 +160,59 @@ class PivotRequest(BaseModel):
|
||||
pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序
|
||||
|
||||
|
||||
class UnpivotRequest(BaseModel):
    """Request model for the unpivot (wide table to long table) operation."""

    # Table rows, one dict per record.
    data: List[Dict[str, Any]]
    # ID columns (columns that are kept unchanged).
    id_vars: List[str]
    # Value columns (columns that are converted).
    value_vars: List[str]
    # Name of the output column holding the variable name.
    var_name: str = '变量'
    # Name of the output column holding the value.
    value_name: str = '值'
    # Whether column names should be parsed.
    parse_column_names: bool = False
    # Separator used when parsing column names.
    separator: str = '_'
    # Name of the metric column (optional).
    metric_name: Optional[str] = None
    # Name of the time column (optional).
    time_name: Optional[str] = None
    # Whether rows with missing values are dropped.
    dropna: bool = False
|
||||
|
||||
|
||||
class MetricTimeTransformRequest(BaseModel):
    """Request model for the metric-time table transformation."""

    # Table rows, one dict per record.
    data: List[Dict[str, Any]]
    # ID columns (columns that are kept unchanged).
    id_vars: List[str]
    # Value columns (several timepoints of the same metric).
    value_vars: List[str]
    # Metric name; auto-detected when None.
    metric_name: Optional[str] = None
    # Separator; auto-detected when None.
    separator: Optional[str] = None
    # Name of the timepoint column.
    timepoint_col_name: str = '时间点'
|
||||
|
||||
|
||||
class MetricTimeDetectRequest(BaseModel):
    """Request model for metric-time pattern detection."""

    # Value columns whose names are analysed to detect the pattern.
    value_vars: List[str]
|
||||
|
||||
|
||||
class MultiMetricDetectRequest(BaseModel):
    """Request model for multi-metric group detection."""

    # Value columns whose names are analysed to detect the grouping.
    value_vars: List[str]
    # Optional list of candidate separators.
    separators: Optional[List[str]] = None
|
||||
|
||||
|
||||
class MultiMetricToLongRequest(BaseModel):
    """Request model for multi-metric to long table conversion (direction 1)."""

    # Table rows, one dict per record.
    data: List[Dict[str, Any]]
    # ID columns.
    id_vars: List[str]
    # Value columns (several timepoints of several metrics).
    value_vars: List[str]
    # Optional list of candidate separators.
    separators: Optional[List[str]] = None
    # Name of the timepoint column.
    event_col_name: str = 'Event_Name'
|
||||
|
||||
|
||||
class MultiMetricToMatrixRequest(BaseModel):
    """Request model for multi-metric to matrix conversion (direction 2)."""

    # Table rows, one dict per record.
    data: List[Dict[str, Any]]
    # ID columns.
    id_vars: List[str]
    # Value columns (several timepoints of several metrics).
    value_vars: List[str]
    # Optional list of candidate separators.
    separators: Optional[List[str]] = None
    # Name of the metric column.
    metric_col_name: str = '指标名'
|
||||
|
||||
|
||||
class FillnaStatsRequest(BaseModel):
|
||||
"""获取列缺失值统计请求模型"""
|
||||
data: List[Dict[str, Any]]
|
||||
@@ -1292,6 +1356,515 @@ async def operation_pivot(request: PivotRequest):
|
||||
}, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/unpivot")
async def operation_unpivot(request: UnpivotRequest):
    """
    Unpivot operation: wide table to long table (pre-written function).

    Turns horizontally laid out data into vertically repeated data.

    Typical medical scenarios:
    - Follow-up data at several timepoints (FMA_基线, FMA_2周 -> a timepoint
      column plus an FMA value column)
    - Combined analysis of several metrics (systolic / diastolic pressure ->
      a metric column plus a measurement column)

    Args:
        request: UnpivotRequest
            - data: table rows
            - id_vars: ID columns (kept unchanged)
            - value_vars: value columns (to be converted)
            - var_name: name of the variable column (default: "变量")
            - value_name: name of the value column (default: "值")
            - parse_column_names: whether to parse column names (default: False)
            - separator: separator (default: "_")
            - metric_name: name of the metric column (optional)
            - time_name: name of the time column (optional)
            - dropna: whether to drop rows with missing values (default: False)

    Returns:
        JSONResponse whose content is, on success:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        On any exception the content is {"success": False, "error": str,
        "execution_time": float} with HTTP status 400.
    """
    import contextlib
    import io
    import time

    # Taken before the try block so the error path can always report a duration.
    start_time = time.time()

    try:
        import numpy as np
        import pandas as pd

        # Capture whatever the transform prints. redirect_stdout puts back the
        # stream that was active before (not unconditionally sys.__stdout__)
        # and does so on every exit path, including exceptions.
        captured_output = io.StringIO()
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)

            # Call the pre-written function.
            result_df = apply_unpivot(
                df,
                request.id_vars,
                request.value_vars,
                request.var_name,
                request.value_name,
                request.parse_column_names,
                request.separator,
                request.metric_name,
                request.time_name,
                request.dropna,
            )
        output = captured_output.getvalue()

        # Convert back to JSON-friendly records. inf is first mapped to NaN
        # (passing None as the replacement value is interpreted as a pad-fill
        # by some pandas versions), then every missing cell becomes None on an
        # object-typed copy so that no NaN can survive in a float column.
        finite_df = result_df.replace([np.inf, -np.inf], np.nan)
        result_df_clean = finite_df.astype(object).where(finite_df.notna(), None)
        result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time

        logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        # Top-level API boundary: report the failure to the client as JSON.
        logger.error(f"Unpivot操作失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/metric-time/detect")
async def operation_metric_time_detect(request: MetricTimeDetectRequest):
    """
    Detect the metric-time table transformation pattern.

    Analyses the column names automatically and detects:
    - the common prefix (metric name)
    - the separator
    - the list of timepoints
    - the confidence

    Args:
        request: MetricTimeDetectRequest
            - value_vars: list of value columns

    Returns:
        {
            "success": bool,
            "pattern": {
                "common_prefix": str,
                "separator": str,
                "timepoints": List[str],
                "confidence": float,
                "message": str
            }
        }
    """
    # Stays None until the timer has actually been started inside the try block.
    start_time = None

    try:
        import time

        start_time = time.time()

        logger.info(f"检测指标-时间表模式: {len(request.value_vars)} 列")

        # Delegate the detection to the helper.
        pattern = detect_common_pattern(request.value_vars)

        execution_time = time.time() - start_time

        logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}")

        payload = {
            "success": pattern['success'],
            "pattern": pattern,
            "execution_time": execution_time
        }
        return JSONResponse(content=payload)

    except Exception as e:
        logger.error(f"模式检测失败: {str(e)}")

        # Report 0 when the failure happened before the timer was started.
        if start_time is None:
            elapsed = 0
        else:
            elapsed = time.time() - start_time

        failure = {
            "success": False,
            "error": str(e),
            "execution_time": elapsed
        }
        return JSONResponse(content=failure, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/metric-time")
async def operation_metric_time_transform(request: MetricTimeTransformRequest):
    """
    Metric-time table transformation (pre-written function).

    Converts several timepoint columns into a "metric rows + timepoint
    columns" layout.

    Typical scenarios:
    - Building Table 1 of a clinical study
    - Side-by-side comparison of one metric over time

    Args:
        request: MetricTimeTransformRequest
            - data: table rows
            - id_vars: ID columns (kept unchanged)
            - value_vars: value columns (several timepoints of one metric)
            - metric_name: metric name (optional, auto-detected)
            - separator: separator (optional, auto-detected)
            - timepoint_col_name: name of the timepoint column

    Returns:
        JSONResponse whose content is, on success:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        On any exception the content is {"success": False, "error": str,
        "execution_time": float} with HTTP status 400.
    """
    import contextlib
    import io
    import time

    # Taken before the try block so the error path can always report a duration.
    start_time = time.time()

    try:
        import numpy as np
        import pandas as pd

        # Capture whatever the transform prints. redirect_stdout puts back the
        # stream that was active before (not unconditionally sys.__stdout__)
        # and does so on every exit path, including exceptions.
        captured_output = io.StringIO()
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)

            # Call the pre-written function.
            result_df = apply_metric_time_transform(
                df,
                request.id_vars,
                request.value_vars,
                request.metric_name,
                request.separator,
                request.timepoint_col_name,
            )
        output = captured_output.getvalue()

        # Convert back to JSON-friendly records. inf is first mapped to NaN
        # (passing None as the replacement value is interpreted as a pad-fill
        # by some pandas versions), then every missing cell becomes None on an
        # object-typed copy so that no NaN can survive in a float column.
        finite_df = result_df.replace([np.inf, -np.inf], np.nan)
        result_df_clean = finite_df.astype(object).where(finite_df.notna(), None)
        result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time

        logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)} 列")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        # Top-level API boundary: report the failure to the client as JSON.
        logger.error(f"指标-时间表转换失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== 多指标转换API ====================
|
||||
|
||||
@app.post("/api/operations/multi-metric/detect")
async def operation_multi_metric_detect(request: MultiMetricDetectRequest):
    """
    Automatic multi-metric group detection.

    Detects the columns belonging to several metrics and groups them
    automatically.

    Args:
        request: MultiMetricDetectRequest
            - value_vars: list of value columns
            - separators: optional list of separators

    Returns:
        {
            "success": bool,
            "metric_groups": Dict[str, List[str]],  # metric grouping
            "separator": str,                       # detected separator
            "timepoints": List[str],                # list of timepoints
            "confidence": float,                    # confidence
            "message": str
        }
    """
    try:
        result = detect_metric_groups(request.value_vars, request.separators)
    except Exception as e:
        logger.error(f"多指标分组检测失败: {str(e)}")
        failure = {"success": False, "error": str(e)}
        return JSONResponse(content=failure, status_code=400)

    try:
        logger.info(f"多指标分组检测: {len(request.value_vars)} 列 → {len(result.get('metric_groups', {}))} 个指标")
        # The detection result is returned to the client as-is.
        return JSONResponse(content=result)
    except Exception as e:
        logger.error(f"多指标分组检测失败: {str(e)}")
        failure = {"success": False, "error": str(e)}
        return JSONResponse(content=failure, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/multi-metric/to-long")
async def operation_multi_metric_to_long(request: MultiMetricToLongRequest):
    """
    Multi-metric to long table (timepoints as rows, metrics as columns).

    Converts a wide table holding several metrics into long format, suited to
    statistical analysis and visualisation.

    Typical scenarios:
    - Longitudinal study data analysis
    - Preparing repeated-measures data
    - Mixed-effects models, GEE analysis
    - Data visualisation (ggplot2, seaborn)

    Args:
        request: MultiMetricToLongRequest
            - data: table rows
            - id_vars: ID columns
            - value_vars: value columns (several timepoints of several metrics)
            - separators: optional list of separators
            - event_col_name: name of the timepoint column

    Returns:
        JSONResponse whose content is, on success:
        {
            "success": bool,
            "result_data": List[Dict],
            "grouping": {...},  # grouping information
            "output": str,
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        When group detection reports failure the content is
        {"success": False, "error": <detection message>, "output": str} with
        HTTP status 400. On any exception the content is {"success": False,
        "error": str, "execution_time": float} with HTTP status 400.
    """
    import contextlib
    import io
    import time

    # Taken before the try block so the error path can always report a duration.
    start_time = time.time()

    try:
        import numpy as np
        import pandas as pd

        # Capture whatever detection / transform print. redirect_stdout puts
        # back the stream that was active before (not unconditionally
        # sys.__stdout__) and does so on every exit path, including the early
        # return below and exceptions.
        captured_output = io.StringIO()
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)

            # 1. Detect the grouping first.
            grouping = detect_metric_groups(
                request.value_vars,
                request.separators,
            )

            if not grouping['success']:
                return JSONResponse(content={
                    "success": False,
                    "error": grouping['message'],
                    "output": captured_output.getvalue()
                }, status_code=400)

            # 2. Run the conversion.
            result_df = apply_multi_metric_to_long(
                df,
                request.id_vars,
                grouping['metric_groups'],
                grouping['separator'],
                request.event_col_name,
            )
        output = captured_output.getvalue()

        # Convert back to JSON-friendly records. inf is first mapped to NaN
        # (passing None as the replacement value is interpreted as a pad-fill
        # by some pandas versions), then every missing cell becomes None on an
        # object-typed copy so that no NaN can survive in a float column.
        finite_df = result_df.replace([np.inf, -np.inf], np.nan)
        result_df_clean = finite_df.astype(object).where(finite_df.notna(), None)
        result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time

        logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "grouping": grouping,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        # Top-level API boundary. logger.exception records the traceback via
        # the logging system instead of printing it straight to stderr.
        logger.exception(f"多指标转长表失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/multi-metric/to-matrix")
async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest):
    """
    Multi-metric to matrix (timepoints as columns, metrics as rows).

    Converts a wide table holding several metrics into matrix format, suited
    to clinical reports and data review.

    Typical scenarios:
    - Clinical study reports
    - Data review tables
    - CRF verification
    - Single-subject data review

    Args:
        request: MultiMetricToMatrixRequest
            - data: table rows
            - id_vars: ID columns
            - value_vars: value columns (several timepoints of several metrics)
            - separators: optional list of separators
            - metric_col_name: name of the metric column

    Returns:
        JSONResponse whose content is, on success:
        {
            "success": bool,
            "result_data": List[Dict],
            "grouping": {...},  # grouping information
            "output": str,
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        When group detection reports failure the content is
        {"success": False, "error": <detection message>, "output": str} with
        HTTP status 400. On any exception the content is {"success": False,
        "error": str, "execution_time": float} with HTTP status 400.
    """
    import contextlib
    import io
    import time

    # Taken before the try block so the error path can always report a duration.
    start_time = time.time()

    try:
        import numpy as np
        import pandas as pd

        # Capture whatever detection / transform print. redirect_stdout puts
        # back the stream that was active before (not unconditionally
        # sys.__stdout__) and does so on every exit path, including the early
        # return below and exceptions.
        captured_output = io.StringIO()
        with contextlib.redirect_stdout(captured_output):
            df = pd.DataFrame(request.data)

            # 1. Detect the grouping first.
            grouping = detect_metric_groups(
                request.value_vars,
                request.separators,
            )

            if not grouping['success']:
                return JSONResponse(content={
                    "success": False,
                    "error": grouping['message'],
                    "output": captured_output.getvalue()
                }, status_code=400)

            # 2. Run the conversion.
            result_df = apply_multi_metric_to_matrix(
                df,
                request.id_vars,
                grouping['metric_groups'],
                grouping['separator'],
                # Fixed value; presumably the event/timepoint column name (it
                # equals the default of MultiMetricToLongRequest.event_col_name)
                # - TODO confirm against apply_multi_metric_to_matrix.
                'Event_Name',
                request.metric_col_name,
            )
        output = captured_output.getvalue()

        # Convert back to JSON-friendly records. inf is first mapped to NaN
        # (passing None as the replacement value is interpreted as a pad-fill
        # by some pandas versions), then every missing cell becomes None on an
        # object-typed copy so that no NaN can survive in a float column.
        finite_df = result_df.replace([np.inf, -np.inf], np.nan)
        result_df_clean = finite_df.astype(object).where(finite_df.notna(), None)
        result_data = result_df_clean.to_dict('records')

        execution_time = time.time() - start_time

        logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")

        return JSONResponse(content={
            "success": True,
            "result_data": result_data,
            "grouping": grouping,
            "output": output,
            "execution_time": execution_time,
            "result_shape": [len(result_data), len(result_df.columns)]
        })

    except Exception as e:
        # Top-level API boundary. logger.exception records the traceback via
        # the logging system instead of printing it straight to stderr.
        logger.exception(f"多指标转矩阵失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/fillna-stats")
|
||||
async def operation_fillna_stats(request: FillnaStatsRequest):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user