feat(dc): Add multi-metric transformation feature (direction 1+2)

Summary:
- Implement intelligent multi-metric grouping detection algorithm
- Add direction 1: timepoint-as-row, metric-as-column (analysis format)
- Add direction 2: timepoint-as-column, metric-as-row (display format)
- Fix column name pattern detection (FMA___ issue)
- Maintain original Record ID order in output
- Add full-select/clear buttons in UI
- Integrate into TransformDialog with Radio selection
- Update 3 documentation files

Technical Details:
- Python: detect_metric_groups(), apply_multi_metric_to_long(), apply_multi_metric_to_matrix()
- Backend: 3 new methods in QuickActionService
- Frontend: MultiMetricPanel.tsx (531 lines)
- Total: ~1460 lines of new code

Status: Fully tested and verified, ready for production
This commit is contained in:
2025-12-21 15:06:15 +08:00
parent 8be8cdcf53
commit 9b81aef9a7
123 changed files with 4781 additions and 150 deletions

View File

@@ -70,6 +70,17 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin
from operations.dropna import drop_missing_values, get_missing_summary
from operations.compute import compute_column, get_formula_examples
from operations.pivot import pivot_long_to_wide, get_pivot_preview
from operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表
from operations.metric_time_transform import (
apply_metric_time_transform,
detect_common_pattern,
preview_metric_time_transform,
detect_metric_groups, # ✨ 多指标自动分组
apply_multi_metric_to_long, # ✨ 多指标转长表方向1
preview_multi_metric_to_long, # ✨ 多指标转换预览方向1
apply_multi_metric_to_matrix, # ✨ 多指标转矩阵方向2
preview_multi_metric_to_matrix # ✨ 多指标转换预览方向2
)
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
@@ -149,6 +160,59 @@ class PivotRequest(BaseModel):
pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序
class UnpivotRequest(BaseModel):
    """Request model for the unpivot operation (wide table -> long table)."""
    data: List[Dict[str, Any]]            # input rows as a list of records
    id_vars: List[str]                    # identifier columns, kept unchanged
    value_vars: List[str]                 # value columns to be unpivoted
    var_name: str = '变量'                 # name for the generated variable-name column
    value_name: str = ''                  # name for the generated value column
    parse_column_names: bool = False      # whether to parse column names into metric/time parts
    separator: str = '_'                  # separator used when parsing column names
    metric_name: Optional[str] = None     # name for the metric column (optional)
    time_name: Optional[str] = None       # name for the time column (optional)
    dropna: bool = False                  # whether to drop rows with missing values
class MetricTimeTransformRequest(BaseModel):
    """Request model for the metric-time table transformation."""
    data: List[Dict[str, Any]]            # input rows as a list of records
    id_vars: List[str]                    # identifier columns, kept unchanged
    value_vars: List[str]                 # value columns (several timepoints of one metric)
    metric_name: Optional[str] = None     # metric name; auto-detected when None
    separator: Optional[str] = None       # separator; auto-detected when None
    timepoint_col_name: str = '时间点'     # name for the generated timepoint column
class MetricTimeDetectRequest(BaseModel):
    """Request model for metric-time pattern detection."""
    value_vars: List[str]                 # value columns used for pattern detection
class MultiMetricDetectRequest(BaseModel):
    """Request model for multi-metric grouping detection."""
    value_vars: List[str]                      # value columns used for grouping detection
    separators: Optional[List[str]] = None     # optional list of candidate separators
class MultiMetricToLongRequest(BaseModel):
    """Request model for multi-metric to long-table transformation (direction 1)."""
    data: List[Dict[str, Any]]                 # input rows as a list of records
    id_vars: List[str]                         # identifier columns
    value_vars: List[str]                      # value columns (several metrics x several timepoints)
    separators: Optional[List[str]] = None     # optional list of candidate separators
    event_col_name: str = 'Event_Name'         # name for the generated timepoint column
class MultiMetricToMatrixRequest(BaseModel):
    """Request model for multi-metric to matrix transformation (direction 2)."""
    data: List[Dict[str, Any]]                 # input rows as a list of records
    id_vars: List[str]                         # identifier columns
    value_vars: List[str]                      # value columns (several metrics x several timepoints)
    separators: Optional[List[str]] = None     # optional list of candidate separators
    metric_col_name: str = '指标名'             # name for the generated metric-name column
class FillnaStatsRequest(BaseModel):
"""获取列缺失值统计请求模型"""
data: List[Dict[str, Any]]
@@ -1292,6 +1356,515 @@ async def operation_pivot(request: PivotRequest):
}, status_code=400)
@app.post("/api/operations/unpivot")
async def operation_unpivot(request: UnpivotRequest):
"""
Unpivot操作宽表转长表预写函数
将横向数据转为纵向重复数据
典型医学场景:
- 多时间点随访数据FMA_基线、FMA_2周 → 时间点列 + FMA值列
- 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列)
Args:
request: UnpivotRequest
- data: 数据
- id_vars: ID列保持不变的列
- value_vars: 值列(需要转换的列)
- var_name: 变量名列名(默认:"变量"
- value_name: 值列名(默认:""
- parse_column_names: 是否解析列名默认False
- separator: 分隔符(默认:"_"
- metric_name: 指标列名(可选)
- time_name: 时间列名(可选)
- dropna: 是否删除缺失值行默认False
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# ✨ 调用预写函数
result_df = apply_unpivot(
df,
request.id_vars,
request.value_vars,
request.var_name,
request.value_name,
request.parse_column_names,
request.separator,
request.metric_name,
request.time_name,
request.dropna
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"Unpivot操作失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/metric-time/detect")
async def operation_metric_time_detect(request: MetricTimeDetectRequest):
"""
检测指标-时间表转换模式
自动分析列名,检测:
- 公共前缀(指标名)
- 分隔符
- 时间点列表
- 置信度
Args:
request: MetricTimeDetectRequest
- value_vars: 值列列表
Returns:
{
"success": bool,
"pattern": {
"common_prefix": str,
"separator": str,
"timepoints": List[str],
"confidence": float,
"message": str
}
}
"""
try:
import time
start_time = time.time()
logger.info(f"检测指标-时间表模式: {len(request.value_vars)}")
# 调用检测函数
pattern = detect_common_pattern(request.value_vars)
execution_time = time.time() - start_time
logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}")
return JSONResponse(content={
"success": pattern['success'],
"pattern": pattern,
"execution_time": execution_time
})
except Exception as e:
logger.error(f"模式检测失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/metric-time")
async def operation_metric_time_transform(request: MetricTimeTransformRequest):
"""
指标-时间表转换操作(预写函数)
将多个时间点列转换为"指标行+时间点列"格式
典型场景:
- 制作临床研究Table 1
- 横向对比同一指标的时间变化
Args:
request: MetricTimeTransformRequest
- data: 数据
- id_vars: ID列保持不变
- value_vars: 值列(同一指标的多个时间点)
- metric_name: 指标名称(可选,自动检测)
- separator: 分隔符(可选,自动检测)
- timepoint_col_name: 时间点列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# ✨ 调用预写函数
result_df = apply_metric_time_transform(
df,
request.id_vars,
request.value_vars,
request.metric_name,
request.separator,
request.timepoint_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"指标-时间表转换失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
# ==================== 多指标转换API ====================
@app.post("/api/operations/multi-metric/detect")
async def operation_multi_metric_detect(request: MultiMetricDetectRequest):
"""
多指标自动分组检测
检测多个指标的列并自动分组
Args:
request: MultiMetricDetectRequest
- value_vars: 值列列表
- separators: 可选的分隔符列表
Returns:
{
"success": bool,
"metric_groups": Dict[str, List[str]], # 指标分组
"separator": str, # 检测到的分隔符
"timepoints": List[str], # 时间点列表
"confidence": float, # 置信度
"message": str
}
"""
try:
result = detect_metric_groups(
request.value_vars,
request.separators
)
logger.info(f"多指标分组检测: {len(request.value_vars)} 列 → {len(result.get('metric_groups', {}))} 个指标")
return JSONResponse(content=result)
except Exception as e:
logger.error(f"多指标分组检测失败: {str(e)}")
return JSONResponse(content={
"success": False,
"error": str(e)
}, status_code=400)
@app.post("/api/operations/multi-metric/to-long")
async def operation_multi_metric_to_long(request: MultiMetricToLongRequest):
"""
多指标转长表(时间点为行,指标为列)
将多个指标的宽表转换为长表格式,适合统计分析和可视化
典型场景:
- 纵向研究数据分析
- 重复测量数据准备
- 混合效应模型、GEE分析
- 数据可视化ggplot2、seaborn
Args:
request: MultiMetricToLongRequest
- data: 数据
- id_vars: ID列
- value_vars: 值列(多个指标的多个时间点)
- separators: 可选的分隔符列表
- event_col_name: 时间点列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"grouping": {...}, # 分组信息
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 1. 先检测分组
grouping = detect_metric_groups(
request.value_vars,
request.separators
)
if not grouping['success']:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
return JSONResponse(content={
"success": False,
"error": grouping['message'],
"output": output
}, status_code=400)
# 2. 执行转换
result_df = apply_multi_metric_to_long(
df,
request.id_vars,
grouping['metric_groups'],
grouping['separator'],
request.event_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"grouping": grouping,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"多指标转长表失败: {str(e)}")
import traceback
traceback.print_exc()
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/multi-metric/to-matrix")
async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest):
"""
多指标转矩阵(时间点为列,指标为行)
将多个指标的宽表转换为矩阵格式,适合临床报告和数据审查
典型场景:
- 临床研究报告
- 数据审查表
- CRF核对
- 单受试者数据审查
Args:
request: MultiMetricToMatrixRequest
- data: 数据
- id_vars: ID列
- value_vars: 值列(多个指标的多个时间点)
- separators: 可选的分隔符列表
- metric_col_name: 指标列名
Returns:
{
"success": bool,
"result_data": List[Dict],
"grouping": {...}, # 分组信息
"output": str,
"execution_time": float,
"result_shape": [rows, cols]
}
"""
try:
import pandas as pd
import numpy as np
import time
import io
import sys
start_time = time.time()
# 捕获打印输出
captured_output = io.StringIO()
sys.stdout = captured_output
try:
# 转换为DataFrame
df = pd.DataFrame(request.data)
# 1. 先检测分组
grouping = detect_metric_groups(
request.value_vars,
request.separators
)
if not grouping['success']:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
return JSONResponse(content={
"success": False,
"error": grouping['message'],
"output": output
}, status_code=400)
# 2. 执行转换
result_df = apply_multi_metric_to_matrix(
df,
request.id_vars,
grouping['metric_groups'],
grouping['separator'],
'Event_Name',
request.metric_col_name
)
# 转换回JSON处理NaN和inf值
result_df = result_df.replace([np.inf, -np.inf], None)
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
result_data = result_df_clean.to_dict('records')
# 恢复stdout
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
execution_time = time.time() - start_time
logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)}")
return JSONResponse(content={
"success": True,
"result_data": result_data,
"grouping": grouping,
"output": output,
"execution_time": execution_time,
"result_shape": [len(result_data), len(result_df.columns)]
})
except Exception as e:
sys.stdout = sys.__stdout__
output = captured_output.getvalue()
raise e
except Exception as e:
logger.error(f"多指标转矩阵失败: {str(e)}")
import traceback
traceback.print_exc()
return JSONResponse(content={
"success": False,
"error": str(e),
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
}, status_code=400)
@app.post("/api/operations/fillna-stats")
async def operation_fillna_stats(request: FillnaStatsRequest):
"""

View File

@@ -24,3 +24,9 @@ __version__ = '1.0.0'

View File

@@ -157,3 +157,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict:

View File

@@ -117,3 +117,9 @@ def apply_filter(

View File

@@ -0,0 +1,921 @@
"""
指标-时间表转换Metric-Time Transform
将多个时间点列转换为"指标行+时间点列"格式
典型医学场景:
- 制作临床研究Table 1
- 横向对比同一指标的时间变化
- 多时间点随访数据整理
示例:
输入(宽表):
Record_ID | FMA___基线 | FMA___2周 | FMA___1月
10 | 54 | 93 | 68
11 | 16 | 31 | 72
输出(指标-时间表):
Record_ID | 时间点 | 基线 | 2周 | 1月
10 | FMA | 54 | 93 | 68
11 | FMA | 16 | 31 | 72
"""
import pandas as pd
import numpy as np
from typing import List, Optional, Dict, Any
import os
from collections import defaultdict
def detect_common_pattern(column_names: List[str]) -> Dict[str, Any]:
    """
    Auto-detect the shared pattern of a set of column names: common prefix
    (metric name), separator, and timepoints.

    Args:
        column_names: list of column names to analyse.

    Returns:
        {
            'success': bool,
            'common_prefix': str,     # common prefix (metric name)
            'separator': str,         # detected separator ('' when none found)
            'timepoints': List[str],  # timepoint labels
            'confidence': float,      # confidence score, 0-1
            'message': str            # human-readable summary
        }

    Examples:
        >>> cols = ['FMA总得分___筛选及基线', 'FMA总得分___随访(2周)', 'FMA总得分___随访(1个月)']
        >>> result = detect_common_pattern(cols)
        >>> result['common_prefix']
        'FMA总得分'
        >>> result['separator']
        '___'
        >>> result['timepoints']
        ['筛选及基线', '随访(2周)', '随访(1个月)']
    """
    print(f"\n🔍 开始自动检测列名模式...", flush=True)
    print(f" 输入列数: {len(column_names)}", flush=True)
    # A pattern cannot be inferred from fewer than two names.
    if len(column_names) < 2:
        return {
            'success': False,
            'common_prefix': '',
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': '至少需要2列才能检测模式'
        }
    # Print the first 3 column names as a sample for the captured log.
    print(f" 样本列名:", flush=True)
    for i, col in enumerate(column_names[:3]):
        print(f" [{i+1}] {col}", flush=True)
    if len(column_names) > 3:
        print(f" ... 还有 {len(column_names) - 3}列", flush=True)
    # ==================== 1. Longest common prefix ====================
    common_prefix = os.path.commonprefix(column_names)
    print(f"\n ✓ 检测到公共前缀: '{common_prefix}'", flush=True)
    if not common_prefix:
        return {
            'success': False,
            'common_prefix': '',
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': '未检测到公共前缀,选中的列可能不属于同一指标'
        }
    # ==================== 2. Separator detection ====================
    # Candidate separators, tried in priority order (longest first, so that
    # '___' wins over '_').
    separators = ['___', '__', '_', '-', '.', '|', ' - ', ' ']
    detected_separator = None
    # Method 1: the common prefix itself ends with a separator.
    for sep in separators:
        if common_prefix.endswith(sep):
            detected_separator = sep
            common_prefix = common_prefix[:-len(sep)]  # drop the trailing separator from the prefix
            print(f" ✓ 检测到分隔符: '{sep}' (位于公共前缀末尾)", flush=True)
            break
    # Method 2: every remainder (name minus prefix) starts with the separator.
    if not detected_separator:
        remainders = [col[len(common_prefix):] for col in column_names]
        for sep in separators:
            if all(r.startswith(sep) for r in remainders if r):
                detected_separator = sep
                print(f" ✓ 检测到分隔符: '{sep}' (位于剩余部分开头)", flush=True)
                break
    # ✨ Method 3: smart correction — when the remainders still contain the
    # separator, the part before it may be a shared suffix of the metric name
    # (the "FMA___" issue); extend the common prefix with it.
    if detected_separator:
        remainders = [col[len(common_prefix):] for col in column_names]
        # Collect what sits before the first separator in each remainder.
        parts_before_sep = []
        for remainder in remainders:
            if detected_separator in remainder:
                # Position of the first separator occurrence.
                sep_pos = remainder.find(detected_separator)
                part = remainder[:sep_pos]
                parts_before_sep.append(part)
            else:
                parts_before_sep.append('')
        # If every non-empty pre-separator part is identical, fold it into the
        # common prefix. NOTE(review): entries with no separator contribute ''
        # and are skipped by the filter — verify this is the intended policy.
        if parts_before_sep and all(p == parts_before_sep[0] for p in parts_before_sep if p):
            additional_prefix = parts_before_sep[0]
            if additional_prefix:
                print(f" 🔄 智能修正: 扩展公共前缀 '{common_prefix}''{common_prefix}{additional_prefix}'", flush=True)
                common_prefix = common_prefix + additional_prefix
    if not detected_separator:
        print(f" ⚠️ 未检测到明确分隔符,使用空字符串", flush=True)
        detected_separator = ''
    # ==================== 3. Timepoint extraction ====================
    if detected_separator:
        # ✨ Fix: remove the whole separator string once (lstrip would strip a
        # character *set* and could eat leading characters of the timepoint).
        timepoints = []
        for col in column_names:
            remainder = col[len(common_prefix):]
            # Drop one leading separator occurrence, if present.
            if remainder.startswith(detected_separator):
                timepoint = remainder[len(detected_separator):]
            else:
                timepoint = remainder
            timepoints.append(timepoint.strip())
    else:
        # No separator: the whole remainder is the timepoint label.
        timepoints = [col[len(common_prefix):].strip() for col in column_names]
    print(f" ✓ 提取到 {len(timepoints)} 个时间点:", flush=True)
    for i, tp in enumerate(timepoints[:5]):
        print(f" [{i+1}] {tp}", flush=True)
    if len(timepoints) > 5:
        print(f" ... 还有 {len(timepoints) - 5}个", flush=True)
    # ==================== 4. Confidence score ====================
    confidence = 1.0
    # Penalty: empty timepoint labels.
    empty_count = sum(1 for tp in timepoints if not tp)
    if empty_count > 0:
        confidence -= 0.3
        print(f" ⚠️ 发现 {empty_count} 个空时间点,降低置信度", flush=True)
    # Penalty: duplicate timepoint labels.
    unique_timepoints = len(set(timepoints))
    if unique_timepoints < len(timepoints):
        confidence -= 0.2
        print(f" ⚠️ 时间点有重复,降低置信度", flush=True)
    # Penalty: suspiciously short common prefix.
    if len(common_prefix) < 2:
        confidence -= 0.2
        print(f" ⚠️ 公共前缀过短,降低置信度", flush=True)
    confidence = max(0.0, min(1.0, confidence))
    print(f"\n 📊 检测置信度: {confidence:.0%}", flush=True)
    # ==================== 5. Summary message ====================
    if confidence >= 0.8:
        message = f"成功检测:指标='{common_prefix}', 分隔符='{detected_separator}', {len(timepoints)}个时间点"
    elif confidence >= 0.5:
        message = f"检测成功但有警告,建议检查结果"
    else:
        message = f"检测置信度较低,建议手动指定参数"
    return {
        'success': True,
        'common_prefix': common_prefix,
        'separator': detected_separator,
        'timepoints': timepoints,
        'confidence': confidence,
        'message': message
    }
def apply_metric_time_transform(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    metric_name: Optional[str] = None,
    separator: Optional[str] = None,
    timepoint_col_name: str = '时间点'
) -> pd.DataFrame:
    """
    Apply the metric-time table transformation.

    Converts several timepoint columns of one metric into a
    "ID columns + metric column + one column per timepoint" layout.

    Args:
        df: input data frame.
        id_vars: identifier columns, kept unchanged.
        value_vars: value columns (several timepoints of the same metric).
        metric_name: metric name; auto-detected when None.
        separator: separator; auto-detected when None.
        timepoint_col_name: name of the generated metric column
            (default: "时间点").

    Returns:
        The transformed data frame.

    Raises:
        ValueError: when id_vars is empty, fewer than 2 value columns are
            given, or auto-detection fails.
        KeyError: when a referenced column does not exist.

    Examples:
        >>> df = pd.DataFrame({
        ...     'Record_ID': [10, 11],
        ...     'FMA___基线': [54, 16],
        ...     'FMA___2周': [93, 31],
        ...     'FMA___1月': [68, 72]
        ... })
        >>> result = apply_metric_time_transform(
        ...     df,
        ...     id_vars=['Record_ID'],
        ...     value_vars=['FMA___基线', 'FMA___2周', 'FMA___1月']
        ... )
        >>> result.columns.tolist()
        ['Record_ID', '时间点', '基线', '2周', '1月']
    """
    print("\n" + "="*60, flush=True)
    print("🔄 开始指标-时间表转换...", flush=True)
    print("="*60, flush=True)
    # ==================== Parameter validation ====================
    if df.empty:
        print("⚠️ 输入数据框为空", flush=True)
        return df
    if not id_vars:
        raise ValueError('❌ 至少需要选择1个ID列')
    if len(value_vars) < 2:
        raise ValueError('❌ 至少需要选择2个值列')
    # Every referenced column must exist.
    for col in id_vars + value_vars:
        if col not in df.columns:
            raise KeyError(f"❌ 列 '{col}' 不存在")
    print(f"\n📊 转换前数据概况:", flush=True)
    print(f" - 总行数: {len(df)}", flush=True)
    print(f" - ID列: {len(id_vars)} 个 ({', '.join(id_vars)})", flush=True)
    print(f" - 值列: {len(value_vars)}个", flush=True)
    # ==================== Auto-detection vs. explicit parameters ====================
    if not metric_name or separator is None:
        print(f"\n🔍 自动检测模式...", flush=True)
        pattern = detect_common_pattern(value_vars)
        if not pattern['success']:
            raise ValueError(f"❌ 自动检测失败: {pattern['message']}")
        metric_name = metric_name or pattern['common_prefix']
        separator = separator if separator is not None else pattern['separator']
        timepoints = pattern['timepoints']
        print(f"\n✅ 使用检测结果:", flush=True)
        print(f" - 指标名: '{metric_name}'", flush=True)
        print(f" - 分隔符: '{separator}'", flush=True)
        print(f" - 置信度: {pattern['confidence']:.0%}", flush=True)
    else:
        print(f"\n✅ 使用手动指定参数:", flush=True)
        print(f" - 指标名: '{metric_name}'", flush=True)
        print(f" - 分隔符: '{separator}'", flush=True)
        # Split timepoints manually from the explicit parameters.
        timepoints = []
        for col in value_vars:
            # Drop the metric name (first occurrence only).
            remainder = col.replace(metric_name, '', 1)
            # BUGFIX: remove exactly one leading separator *string*. The
            # previous code used lstrip(separator), which strips a character
            # set and could eat leading characters of the timepoint (the
            # auto-detect path above was already fixed the same way).
            if separator and remainder.startswith(separator):
                remainder = remainder[len(separator):]
            timepoints.append(remainder.strip())
    # ==================== Build the result DataFrame ====================
    print(f"\n🔨 开始构建结果数据...", flush=True)
    result_rows = []
    for idx, row in df.iterrows():
        result_row = {}
        # 1. Copy the ID columns.
        for id_col in id_vars:
            result_row[id_col] = row[id_col]
        # 2. The "timepoint" column actually stores the metric name.
        result_row[timepoint_col_name] = metric_name
        # 3. One output column per timepoint.
        for original_col, timepoint in zip(value_vars, timepoints):
            result_row[timepoint] = row[original_col]
        result_rows.append(result_row)
    result_df = pd.DataFrame(result_rows)
    # ==================== Column ordering ====================
    # Order: ID columns + metric column + timepoint columns.
    column_order = id_vars + [timepoint_col_name] + timepoints
    result_df = result_df[column_order]
    # ==================== Summary output ====================
    print(f"\n{'='*60}", flush=True)
    print(f"✅ 指标-时间表转换完成!", flush=True)
    print(f"{'='*60}", flush=True)
    print(f"📊 转换结果:", flush=True)
    print(f" - 总行数: {len(result_df)} (不变)", flush=True)
    print(f" - 总列数: {len(result_df.columns)} (ID列 + 时间点列 + {len(timepoints)}个时间点列)", flush=True)
    print(f" - 指标名: {metric_name}", flush=True)
    print(f" - 时间点: {', '.join(timepoints[:5])}{'...' if len(timepoints) > 5 else ''}", flush=True)
    # Show the first 3 rows as an example in the captured log.
    print(f"\n 前3行数据示例:", flush=True)
    for idx, row in result_df.head(3).iterrows():
        row_preview = ' | '.join([f"{col}={row[col]}" for col in result_df.columns[:4]])
        print(f" [{idx}] {row_preview}...", flush=True)
    return result_df
def preview_metric_time_transform(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    preview_rows: int = 5
) -> Dict[str, Any]:
    """
    Preview the metric-time transformation without running it on all rows.

    Args:
        df: input data frame.
        id_vars: identifier columns.
        value_vars: value columns.
        preview_rows: how many leading rows to transform for the preview.

    Returns:
        On success: pattern, original_shape, new_shape, preview_data and an
        estimated_change description. On failure: {'success': False,
        'error': message}.
    """
    # Detect the naming pattern first; bail out early if it fails.
    pattern = detect_common_pattern(value_vars)
    if not pattern['success']:
        return {
            'success': False,
            'error': pattern['message']
        }
    # Transform only the first few rows.
    sample = df.head(preview_rows)
    try:
        transformed = apply_metric_time_transform(
            sample,
            id_vars,
            value_vars,
            pattern['common_prefix'],
            pattern['separator']
        )
    except Exception as e:
        return {
            'success': False,
            'error': str(e)
        }
    return {
        'success': True,
        'pattern': pattern,
        'original_shape': (len(df), len(df.columns)),
        'new_shape': (len(df), len(id_vars) + 1 + len(pattern['timepoints'])),
        'preview_data': transformed.to_dict('records'),
        'estimated_change': f"列数: {len(df.columns)}{len(id_vars) + 1 + len(pattern['timepoints'])} (ID列 + 时间点列 + {len(pattern['timepoints'])}个时间点列)"
    }
# ==================== 多指标转换方向1时间点为行指标为列====================
def detect_metric_groups(
    column_names: List[str],
    separators: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Auto-detect and group columns belonging to several metrics.

    Args:
        column_names: column names, e.g.
            ['FMA总得分_基线', 'FMA总得分_随访1', 'ADL总分_基线', 'ADL总分_随访1'].
        separators: candidate separators; defaults to
            ['___', '__', '_', '-', '.', '|', ' - ', ' '].

    Returns:
        {
            'success': bool,
            'metric_groups': {metric_name: [column, ...], ...},
            'separator': str,          # detected separator
            'timepoints': [str, ...],  # shared timepoint labels
            'confidence': float,       # 0.0-1.0
            'message': str
        }
    """
    print(f"\n🔍 开始自动检测多指标分组...", flush=True)
    print(f" 输入列数: {len(column_names)}", flush=True)

    def _failure(msg: str) -> Dict[str, Any]:
        # Uniform failure payload.
        return {
            'success': False,
            'metric_groups': {},
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': msg
        }

    if len(column_names) < 2:
        return _failure('至少需要2列才能检测分组')

    candidate_seps = separators if separators is not None else ['___', '__', '_', '-', '.', '|', ' - ', ' ']

    def _group_with(sep: str):
        # Group columns by the part before `sep`; None when any column
        # does not split cleanly on this separator.
        buckets = defaultdict(list)
        for col in column_names:
            if sep not in col:
                return None
            pieces = col.split(sep)
            if len(pieces) < 2:
                return None
            buckets[pieces[0]].append(col)
        return buckets if buckets else None

    # ==================== 1. Try each candidate separator ====================
    chosen_sep = None
    groups = None
    for sep in candidate_seps:
        groups = _group_with(sep)
        if groups is not None:
            chosen_sep = sep
            print(f" ✓ 检测到分隔符: '{sep}'", flush=True)
            break
    if chosen_sep is None:
        return _failure('未检测到公共分隔符,请确认选中的列格式一致')

    # ==================== 2. Timepoints per metric ====================
    # The last piece is the timepoint label (supports multi-level names such
    # as "FMA总得分_子项_基线").
    per_metric_tps = {
        metric: [col.split(chosen_sep)[-1].strip() for col in cols]
        for metric, cols in groups.items()
    }
    print(f" ✓ 检测到 {len(groups)} 个指标:", flush=True)
    for metric, cols in groups.items():
        print(f"{metric} ({len(cols)}列)", flush=True)

    # ==================== 3. Timepoint consistency check ====================
    tp_lists = list(per_metric_tps.values())
    reference = tp_lists[0]
    uniform = all(tps == reference for tps in tp_lists[1:])
    if uniform:
        timeline = reference
        score = 1.0
        note = f"成功检测到{len(groups)}个指标,共{len(timeline)}个时间点"
    else:
        print(f" ⚠️ 警告: 各指标的时间点不完全一致", flush=True)
        # Fall back to the sorted union of all timepoints.
        timeline = sorted(set(tp for tps in tp_lists for tp in tps))
        score = 0.6
        note = f"检测到{len(groups)}个指标但时间点不完全一致。将使用所有时间点的并集缺失值将填充为NA。"
    print(f" ✓ 检测到 {len(timeline)} 个时间点:", flush=True)
    for i, tp in enumerate(timeline[:5]):
        print(f" [{i+1}] {tp}", flush=True)
    if len(timeline) > 5:
        print(f" ... 还有 {len(timeline) - 5}个", flush=True)

    # ==================== 4. Confidence adjustment ====================
    # Metrics carrying different numbers of columns lower the confidence.
    if len({len(cols) for cols in groups.values()}) > 1:
        score -= 0.2
        print(f" ⚠️ 各指标的列数不同,降低置信度", flush=True)

    return {
        'success': True,
        'metric_groups': dict(groups),
        'separator': chosen_sep,
        'timepoints': timeline,
        'confidence': score,
        'message': note
    }
def apply_multi_metric_to_long(
    df: pd.DataFrame,
    id_vars: List[str],
    metric_groups: Dict[str, List[str]],
    separator: str,
    event_col_name: str = 'Event_Name'
) -> pd.DataFrame:
    """
    Multi-metric to long table: timepoints become rows, metrics become columns.

    Args:
        df: original data frame.
        id_vars: identifier columns.
        metric_groups: grouping dict, e.g.
            {'FMA总得分': ['FMA总得分_基线', ...], ...}.
        separator: separator between metric name and timepoint.
        event_col_name: name of the generated timepoint column.

    Returns:
        The transformed data frame with columns
        id_vars + [event_col_name] + metric columns, in the original row order.
    """
    print(f"\n🔄 开始多指标转长表转换...", flush=True)
    print(f" 原始形状: {df.shape}", flush=True)
    print(f" ID列: {id_vars}", flush=True)
    print(f" 指标数: {len(metric_groups)}", flush=True)
    # ✨ Remember the original row order so Record IDs keep their input order.
    work = df.copy()
    work['_original_order'] = range(len(work))
    key_cols = id_vars + ['_original_order']
    # ==================== 1. Melt each metric separately ====================
    frames = []
    for name, cols in metric_groups.items():
        print(f" • 处理指标: {name} ({len(cols)}列)", flush=True)
        # Keep the order column alongside the IDs through the melt.
        melted = work[key_cols + cols].melt(
            id_vars=key_cols,
            value_vars=cols,
            var_name='_temp_col',
            value_name=name
        )
        # The timepoint is whatever follows the last separator occurrence.
        melted[event_col_name] = melted['_temp_col'].apply(
            lambda c: c.split(separator)[-1].strip() if separator in c else c
        )
        frames.append(melted.drop('_temp_col', axis=1))
    # ==================== 2. Merge all metrics ====================
    print(f" • 合并 {len(frames)} 个指标的数据...", flush=True)
    merged = frames[0]
    for extra in frames[1:]:
        # Outer join keeps every timepoint even when a metric lacks it.
        merged = merged.merge(extra, on=key_cols + [event_col_name], how='outer')
    # ==================== 3. Sorting ====================
    # ✨ Original row order first, then timepoint, to preserve input ID order.
    merged = merged.sort_values(by=['_original_order', event_col_name]).reset_index(drop=True)
    merged = merged.drop('_original_order', axis=1)
    # ==================== 4. Column ordering ====================
    # Final layout: ID columns → Event_Name → all metric columns.
    value_cols = [c for c in merged.columns if c not in id_vars and c != event_col_name]
    merged = merged[id_vars + [event_col_name] + value_cols]
    print(f" ✓ 转换完成!新形状: {merged.shape}", flush=True)
    print(f" ✓ 列顺序: {list(merged.columns)}", flush=True)
    return merged
def preview_multi_metric_to_long(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    separators: Optional[List[str]] = None,
    event_col_name: str = 'Event_Name',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """
    Preview the multi-metric to long-table transformation on a few rows.

    Args:
        df: input data frame.
        id_vars: identifier columns.
        value_vars: value columns (several metrics x several timepoints).
        separators: optional candidate separators for grouping detection.
        event_col_name: name of the generated timepoint column.
        preview_rows: how many leading rows to transform.

    Returns:
        On success: grouping, original_shape, new_shape, preview_data and an
        estimated_change description. On failure: {'success': False,
        'error': message}.
    """
    print(f"\n📊 预览多指标转长表...", flush=True)
    # 1. Detect the metric grouping; bail out early when it fails.
    grouping = detect_metric_groups(value_vars, separators)
    if not grouping['success']:
        return {
            'success': False,
            'error': grouping['message']
        }
    # 2. Run the transformation on the first few rows only.
    sample = df.head(preview_rows)
    try:
        converted = apply_multi_metric_to_long(
            sample,
            id_vars,
            grouping['metric_groups'],
            grouping['separator'],
            event_col_name
        )
    except Exception as e:
        import traceback
        print(f" ❌ 预览失败: {str(e)}", flush=True)
        traceback.print_exc()
        return {
            'success': False,
            'error': str(e)
        }
    num_metrics = len(grouping['metric_groups'])
    num_timepoints = len(grouping['timepoints'])
    return {
        'success': True,
        'grouping': grouping,
        'original_shape': (len(df), len(df.columns)),
        'new_shape': (len(df) * num_timepoints, len(id_vars) + 1 + num_metrics),
        'preview_data': converted.to_dict('records'),
        'estimated_change': f"行数: {len(df)}{len(df) * num_timepoints} (每个ID复制{num_timepoints}次); 列数: {len(df.columns)}{len(id_vars) + 1 + num_metrics} (ID列 + 时间点列 + {num_metrics}个指标列)"
    }
# ==================== 多指标转换方向2时间点为列指标为行====================
def apply_multi_metric_to_matrix(
    df: pd.DataFrame,
    id_vars: List[str],
    metric_groups: Dict[str, List[str]],
    separator: str,
    event_col_name: str = 'Event_Name',
    metric_col_name: str = '指标名'
) -> pd.DataFrame:
    """Reshape multi-metric wide data into a matrix: timepoints as columns, metrics as rows.

    Args:
        df: Original wide-format data frame.
        id_vars: Identifier columns.
        metric_groups: Mapping of metric name -> list of its wide columns
            (e.g. ``{"FMA": ["FMA_基线", "FMA_随访1"]}``).
        separator: Separator between metric name and timepoint in column names.
        event_col_name: Name of the intermediate timepoint column used while
            passing through the long format.
        metric_col_name: Name of the output column that holds the metric name.

    Returns:
        Data frame with one row per (record, metric) and one column per
        timepoint, ordered as: id columns, metric-name column, timepoint
        columns (timepoints in their original column order, rows in the
        original record order).
    """
    print(f"\n🔄 开始多指标转矩阵格式...", flush=True)
    print(f" 原始形状: {df.shape}", flush=True)
    print(f" ID列: {id_vars}", flush=True)
    print(f" 指标数: {len(metric_groups)}", flush=True)
    # Remember each record's original position so the output can be restored
    # to the original record order after the pivot scrambles it.
    df_with_order = df.copy()
    df_with_order['_original_order'] = range(len(df_with_order))
    # Map id value(s) -> original position; tuple keys when several id columns.
    if len(id_vars) == 1:
        id_to_order = df_with_order.set_index(id_vars[0])['_original_order'].to_dict()
    else:
        id_to_order = df_with_order.set_index(id_vars)['_original_order'].to_dict()
    # ==================== 1. wide -> long ====================
    df_long = apply_multi_metric_to_long(
        df,
        id_vars,
        metric_groups,
        separator,
        event_col_name
    )
    print(f" • 长表形状: {df_long.shape}", flush=True)
    # ==================== 2. long -> matrix (metric rows, timepoint columns) ====================
    metric_cols = [col for col in df_long.columns if col not in id_vars and col != event_col_name]
    print(f" • 准备pivot: {len(metric_cols)} 个指标列", flush=True)
    # Melt every metric column into (id, timepoint, metric-name, value) rows.
    df_melted = df_long.melt(
        id_vars=id_vars + [event_col_name],
        value_vars=metric_cols,
        var_name=metric_col_name,
        value_name='_value'
    )
    print(f" • Melt后形状: {df_melted.shape}", flush=True)
    # pivot_table rather than pivot, because the index may contain duplicates.
    result = df_melted.pivot_table(
        index=id_vars + [metric_col_name],
        columns=event_col_name,
        values='_value',
        aggfunc='first'  # keep the first value when duplicates occur
    ).reset_index()
    # Drop the residual axis name left behind by the pivot.
    result.columns.name = None
    # Attach the original record position (used for sorting below).
    if len(id_vars) == 1:
        result['_original_order'] = result[id_vars[0]].map(id_to_order)
    else:
        # Several id columns: build tuple keys to match id_to_order.
        result['_original_order'] = result[id_vars].apply(tuple, axis=1).map(id_to_order)
    # ==================== 3. determine timepoint column order ====================
    # BUGFIX: exclude the helper '_original_order' column. Previously it leaked
    # into timepoint_cols, was appended to sorted_timepoint_cols by the
    # defensive loop, then dropped before the final column selection — making
    # result[desired_column_order] raise KeyError('_original_order') every call.
    timepoint_cols = [col for col in result.columns if col not in id_vars and col != metric_col_name and col != '_original_order']
    # Recover the original timepoint order from the first metric group.
    first_metric_cols = list(metric_groups.values())[0]
    original_timepoint_order = []
    for col in first_metric_cols:
        timepoint = col.split(separator)[-1].strip() if separator in col else col
        if timepoint not in original_timepoint_order:
            original_timepoint_order.append(timepoint)
    # Arrange timepoint columns following the detected original order.
    sorted_timepoint_cols = []
    for tp in original_timepoint_order:
        if tp in timepoint_cols:
            sorted_timepoint_cols.append(tp)
    # Defensive: append any timepoint the detected order did not cover.
    for tp in timepoint_cols:
        if tp not in sorted_timepoint_cols:
            sorted_timepoint_cols.append(tp)
    # ==================== 4. sort by original record order, then metric ====================
    result = result.sort_values(by=['_original_order', metric_col_name]).reset_index(drop=True)
    # Helper column no longer needed.
    result = result.drop('_original_order', axis=1)
    # ==================== 5. final column order: ids, metric name, timepoints ====================
    desired_column_order = id_vars + [metric_col_name] + sorted_timepoint_cols
    result = result[desired_column_order]
    print(f" ✓ 转换完成!新形状: {result.shape}", flush=True)
    print(f" ✓ 列顺序: {list(result.columns)}", flush=True)
    return result
def preview_multi_metric_to_matrix(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    separators: Optional[List[str]] = None,
    metric_col_name: str = '指标名',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """Preview the multi-metric matrix transformation (timepoints as columns).

    Detects the metric/timepoint grouping of ``value_vars`` and applies the
    matrix transformation to only the first ``preview_rows`` rows.

    Returns a dict with ``success`` plus, on success, ``grouping``,
    ``original_shape``, ``new_shape`` (estimated for the FULL data frame),
    ``preview_data`` and ``estimated_change``; on failure, ``error``.
    """
    print(f"\n📊 预览多指标转矩阵格式...", flush=True)
    # Step 1: detect the metric x timepoint grouping.
    grouping = detect_metric_groups(value_vars, separators)
    if not grouping['success']:
        return {
            'success': False,
            'error': grouping['message']
        }
    # Step 2: run the transformation on a small slice only.
    sample = df.head(preview_rows)
    try:
        transformed = apply_multi_metric_to_matrix(
            sample,
            id_vars,
            grouping['metric_groups'],
            grouping['separator'],
            'Event_Name',
            metric_col_name
        )
        n_metrics = len(grouping['metric_groups'])
        n_timepoints = len(grouping['timepoints'])
        # Estimated full-size shape: one row per (record, metric);
        # columns = ids + metric-name column + one column per timepoint.
        est_rows = len(df) * n_metrics
        est_cols = len(id_vars) + 1 + n_timepoints
        return {
            'success': True,
            'grouping': grouping,
            'original_shape': (len(df), len(df.columns)),
            'new_shape': (est_rows, est_cols),
            'preview_data': transformed.to_dict('records'),
            'estimated_change': f"行数: {len(df)}{est_rows} (每个ID复制{n_metrics}每个指标1行); 列数: {len(df.columns)}{est_cols} (ID列 + 指标名列 + {n_timepoints}个时间点列)"
        }
    except Exception as exc:
        import traceback
        print(f" ❌ 预览失败: {str(exc)}", flush=True)
        traceback.print_exc()
        return {
            'success': False,
            'error': str(exc)
        }

View File

@@ -0,0 +1,289 @@
"""
Wide-to-long (unpivot/melt) operations.

Provides data-reshaping helpers that convert wide-format tables into long format.
Typical medical scenarios:
- Multi-timepoint follow-up data (FMA_基线, FMA_2周 -> timepoint column + FMA value column)
- Merging multiple metrics for analysis (systolic/diastolic BP -> metric column + value column)
- Treatment-group comparison (治疗组_NRS, 对照组_NRS -> group column + NRS column)
"""
import pandas as pd
import numpy as np
from typing import List, Optional, Dict, Any
import sys
def apply_unpivot(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    var_name: str = '变量',
    value_name: str = '',
    parse_column_names: bool = False,
    separator: str = '_',
    metric_name: Optional[str] = None,
    time_name: Optional[str] = None,
    dropna: bool = False
) -> pd.DataFrame:
    """Unpivot (melt) a wide-format table into long format.

    Args:
        df: Input data frame.
        id_vars: Identifier columns kept unchanged (at least 1).
        value_vars: Value columns to unpivot (at least 2).
        var_name: Name of the column storing the original column names.
        value_name: Name of the column storing the actual values.
        parse_column_names: If True, split each original column name into a
            metric part and a time part (e.g. "FMA_基线" -> "FMA" + "基线").
        separator: Separator used when splitting column names.
        metric_name: Output name of the metric column (parsing only).
        time_name: Output name of the time column (parsing only).
        dropna: If True, drop rows whose value is missing.

    Returns:
        Long-format data frame. Without name parsing, columns are
        ``id_vars + [var_name, value_name]``; with parsing, they are
        ``id_vars + [metric, time, value_name]``.

    Raises:
        ValueError: No id column, fewer than 2 value columns, or id/value
            columns overlap.
        KeyError: A referenced column does not exist in ``df``.
    """
    print("\n" + "="*60, flush=True)
    print("🔄 开始宽表转长表转换...", flush=True)
    print("="*60, flush=True)
    # ==================== parameter validation ====================
    if df.empty:
        print("⚠️ 输入数据框为空", flush=True)
        return df
    if not id_vars:
        raise ValueError('❌ 至少需要选择1个ID列标识列')
    if len(value_vars) < 2:
        raise ValueError('❌ 至少需要选择2个值列需要转换的列')
    # All referenced columns must exist in the frame.
    missing_id_cols = [col for col in id_vars if col not in df.columns]
    if missing_id_cols:
        raise KeyError(f"❌ ID列不存在: {', '.join(missing_id_cols)}")
    missing_value_cols = [col for col in value_vars if col not in df.columns]
    if missing_value_cols:
        raise KeyError(f"❌ 值列不存在: {', '.join(missing_value_cols)}")
    # Id and value columns must be disjoint.
    overlap = set(id_vars) & set(value_vars)
    if overlap:
        raise ValueError(f"❌ ID列和值列不能重复: {', '.join(overlap)}")
    print(f"\n📊 转换前数据概况:", flush=True)
    print(f" - 总行数: {len(df)}", flush=True)
    print(f" - 总列数: {len(df.columns)}", flush=True)
    print(f" - ID列: {len(id_vars)} 个 ({', '.join(id_vars[:3])}{'...' if len(id_vars) > 3 else ''})", flush=True)
    print(f" - 值列: {len(value_vars)} 个 ({', '.join(value_vars[:3])}{'...' if len(value_vars) > 3 else ''})", flush=True)
    # ==================== base transformation (pandas.melt) ====================
    try:
        result = pd.melt(
            df,
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=var_name,
            value_name=value_name
        )
        print(f"\n✅ 基础转换完成:", flush=True)
        print(f" - 转换后行数: {len(result)} (原 {len(df)} × {len(value_vars)})", flush=True)
        print(f" - 转换后列数: {len(result.columns)} (ID列 + 变量名列 + 值列)", flush=True)
    except Exception as e:
        print(f"❌ 转换失败: {str(e)}", flush=True)
        raise
    # ==================== optional: parse column names ====================
    if parse_column_names and separator:
        print(f"\n🔍 开始解析列名(分隔符: '{separator}'...", flush=True)
        def parse_column_name(name: str):
            """Split "METRIC<sep>TIME" into (metric, time); extra separators stay in the time part."""
            parts = name.split(separator)
            if len(parts) >= 2:
                metric = parts[0]
                time = separator.join(parts[1:])
                return metric, time
            else:
                # No separator: whole name is the metric, time is left empty.
                return name, ''
        try:
            # Split every original column name into its two parts.
            parsed = result[var_name].apply(parse_column_name)
            metric_col = metric_name or '指标'
            time_col = time_name or '时间点'
            result[metric_col] = parsed.str[0]
            result[time_col] = parsed.str[1]
            # Raw column-name column has been split; drop it.
            result = result.drop(columns=[var_name])
            # FIX: place the value column last so the layout matches the
            # documented contract (ids, metric, time, value). Previously the
            # value column ended up BEFORE metric/time, contradicting the docs.
            result = result[id_vars + [metric_col, time_col, value_name]]
            unique_metrics = result[metric_col].nunique()
            unique_times = result[time_col].nunique()
            print(f"✅ 列名解析完成:", flush=True)
            print(f" - {metric_col}列: {unique_metrics} 个唯一值", flush=True)
            print(f" - {time_col}列: {unique_times} 个唯一值", flush=True)
            # Show up to three parsing examples.
            sample_original = value_vars[:3]
            print(f"\n 解析示例:", flush=True)
            for orig in sample_original:
                metric, time = parse_column_name(orig)
                print(f" - '{orig}'{metric_col}='{metric}', {time_col}='{time}'", flush=True)
        except Exception as e:
            # Best-effort: keep the unparsed variable-name column on failure.
            print(f"⚠️ 列名解析失败: {str(e)}", flush=True)
            print(f" 已保留原变量名列: {var_name}", flush=True)
    # ==================== optional: drop missing-value rows ====================
    if dropna:
        original_len = len(result)
        result = result.dropna(subset=[value_name])
        dropped = original_len - len(result)
        if dropped > 0:
            print(f"\n🗑️ 删除缺失值行: {dropped} 行 ({dropped/original_len*100:.1f}%)", flush=True)
    # ==================== sort ====================
    # FIX: use a stable sort so melt's timepoint order within each ID group
    # is preserved (the default quicksort may scramble rows that compare equal).
    result = result.sort_values(id_vars, kind='stable').reset_index(drop=True)
    print(f"\n✅ 排序完成: 按 {', '.join(id_vars[:2])}{'...' if len(id_vars) > 2 else ''} 排序", flush=True)
    # ==================== final statistics ====================
    print(f"\n{'='*60}", flush=True)
    print(f"✅ 宽表转长表转换完成!", flush=True)
    print(f"{'='*60}", flush=True)
    print(f"📊 最终数据:", flush=True)
    print(f" - 总行数: {len(result)} (扩展了 {len(result)/len(df):.1f}x)", flush=True)
    print(f" - 总列数: {len(result.columns)}", flush=True)
    print(f" - 列名: {', '.join(result.columns.tolist())}", flush=True)
    # Show the first three rows as a sanity check.
    print(f"\n 前3行数据示例:", flush=True)
    for idx, row in result.head(3).iterrows():
        row_str = ' | '.join([f"{col}={row[col]}" for col in result.columns[:4]])
        print(f" [{idx}] {row_str}...", flush=True)
    return result
def get_unpivot_preview(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    var_name: str = '变量',
    value_name: str = '',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """Estimate the result of an unpivot without running the full transformation.

    Args:
        df: Input data frame.
        id_vars: Identifier columns.
        value_vars: Value columns to be unpivoted.
        var_name: Name of the variable-name column.
        value_name: Name of the value column.
        preview_rows: Maximum number of preview rows to return.

    Returns:
        Dict with ``original_shape``, ``new_shape`` (estimated full result),
        ``expansion_factor`` (row multiplier), ``preview_data`` (up to
        ``preview_rows`` melted rows) and ``estimated_change`` (human-readable
        summary string).
    """
    original_rows = len(df)
    original_cols = len(df.columns)
    # Estimated shape of the full result: one row per (record, value column);
    # columns = id columns + variable-name column + value column.
    new_rows = original_rows * len(value_vars)
    new_cols = len(id_vars) + 2
    expansion_factor = len(value_vars)
    # FIX: sample enough source rows to fill the preview. The source was
    # previously hard-coded to head(3), which silently capped preview_data at
    # 3 * len(value_vars) rows regardless of preview_rows.
    sample = df.head(min(preview_rows, len(df)))
    preview_result = pd.melt(
        sample,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name
    )
    return {
        'original_shape': (original_rows, original_cols),
        'new_shape': (new_rows, new_cols),
        'expansion_factor': expansion_factor,
        'preview_data': preview_result.head(preview_rows).to_dict('records'),
        'estimated_change': f"将从 {original_rows}× {original_cols} 列 转换为 {new_rows}× {new_cols}"
    }

View File

@@ -291,3 +291,9 @@ if __name__ == "__main__":

View File

@@ -57,3 +57,9 @@ except Exception as e:

View File

@@ -37,3 +37,9 @@ except Exception as e: