feat(dc): Add multi-metric transformation feature (direction 1+2)
Summary: - Implement intelligent multi-metric grouping detection algorithm - Add direction 1: timepoint-as-row, metric-as-column (analysis format) - Add direction 2: timepoint-as-column, metric-as-row (display format) - Fix column name pattern detection (FMA___ issue) - Maintain original Record ID order in output - Add full-select/clear buttons in UI - Integrate into TransformDialog with Radio selection - Update 3 documentation files Technical Details: - Python: detect_metric_groups(), apply_multi_metric_to_long(), apply_multi_metric_to_matrix() - Backend: 3 new methods in QuickActionService - Frontend: MultiMetricPanel.tsx (531 lines) - Total: ~1460 lines of new code Status: Fully tested and verified, ready for production
This commit is contained in:
@@ -230,6 +230,12 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -32,3 +32,9 @@ WHERE table_schema = 'dc_schema'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -70,3 +70,9 @@ ORDER BY ordinal_position;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -83,3 +83,9 @@ runMigration()
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -17,3 +17,9 @@ COMMENT ON COLUMN "dc_schema"."dc_tool_c_sessions"."column_mapping" IS '列名
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -44,3 +44,9 @@ COMMENT ON COLUMN dc_schema.dc_tool_c_sessions.expires_at IS '过期时间(创
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -188,6 +188,12 @@ function extractCodeBlocks(obj, blocks = []) {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -210,6 +210,12 @@ checkDCTables();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -165,3 +165,9 @@ createAiHistoryTable()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -152,3 +152,9 @@ createToolCTable()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -149,3 +149,9 @@ createToolCTable()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -281,3 +281,9 @@ export function getBatchItems<T>(
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -310,6 +310,12 @@ runTests().catch((error) => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -251,6 +251,12 @@ runTest()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -289,6 +289,12 @@ Content-Type: application/json
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -368,6 +368,12 @@ export class ExcelExporter {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -225,6 +225,12 @@ export const conflictDetectionService = new ConflictDetectionService();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -253,6 +253,12 @@ export const templateService = new TemplateService();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -390,3 +390,9 @@ async function countCompletedBatches(taskId: string): Promise<number> {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -182,3 +182,9 @@ curl -X POST http://localhost:3000/api/v1/dc/tool-c/test/execute \
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ import { prisma } from '../../../../config/database.js';
|
||||
|
||||
interface QuickActionRequest {
|
||||
sessionId: string;
|
||||
action: 'filter' | 'recode' | 'binning' | 'conditional' | 'dropna' | 'dedup';
|
||||
action: 'filter' | 'recode' | 'binning' | 'conditional' | 'dropna' | 'dedup' | 'compute' | 'pivot' | 'unpivot' | 'metric_time' | 'multi_metric_to_long' | 'multi_metric_to_matrix';
|
||||
params: any;
|
||||
userId?: string;
|
||||
}
|
||||
@@ -105,6 +105,18 @@ export class QuickActionController {
|
||||
case 'pivot':
|
||||
actionDescription = 'Pivot转换';
|
||||
break;
|
||||
case 'unpivot':
|
||||
actionDescription = 'Unpivot转换(宽→长表)';
|
||||
break;
|
||||
case 'metric_time':
|
||||
actionDescription = '指标-时间表转换';
|
||||
break;
|
||||
case 'multi_metric_to_long':
|
||||
actionDescription = '多指标转长表';
|
||||
break;
|
||||
case 'multi_metric_to_matrix':
|
||||
actionDescription = '多指标转矩阵';
|
||||
break;
|
||||
default:
|
||||
logger.warn(`[QuickAction] 不支持的操作: ${action}`);
|
||||
return reply.code(400).send({
|
||||
@@ -184,6 +196,22 @@ export class QuickActionController {
|
||||
pivotValueOrder
|
||||
);
|
||||
break;
|
||||
case 'unpivot':
|
||||
// Unpivot不需要columnMapping,直接执行
|
||||
executeResult = await quickActionService.executeUnpivot(fullData, params);
|
||||
break;
|
||||
case 'metric_time':
|
||||
// 指标-时间表转换
|
||||
executeResult = await quickActionService.executeMetricTime(fullData, params);
|
||||
break;
|
||||
case 'multi_metric_to_long':
|
||||
// 多指标转长表
|
||||
executeResult = await quickActionService.executeMultiMetricToLong(fullData, params);
|
||||
break;
|
||||
case 'multi_metric_to_matrix':
|
||||
// 多指标转矩阵
|
||||
executeResult = await quickActionService.executeMultiMetricToMatrix(fullData, params);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!executeResult.success) {
|
||||
@@ -340,9 +368,27 @@ export class QuickActionController {
|
||||
case 'dropna':
|
||||
executeResult = await quickActionService.executeDropna(fullData, params);
|
||||
break;
|
||||
case 'compute':
|
||||
executeResult = await quickActionService.executeCompute(fullData, params);
|
||||
break;
|
||||
case 'dedup':
|
||||
// TODO: 实现去重功能
|
||||
return reply.code(400).send({ success: false, error: '去重功能尚未实现' });
|
||||
case 'pivot':
|
||||
executeResult = await quickActionService.executePivot(fullData, params);
|
||||
break;
|
||||
case 'unpivot':
|
||||
executeResult = await quickActionService.executeUnpivot(fullData, params);
|
||||
break;
|
||||
case 'metric_time':
|
||||
executeResult = await quickActionService.executeMetricTime(fullData, params);
|
||||
break;
|
||||
case 'multi_metric_to_long':
|
||||
executeResult = await quickActionService.executeMultiMetricToLong(fullData, params);
|
||||
break;
|
||||
case 'multi_metric_to_matrix':
|
||||
executeResult = await quickActionService.executeMultiMetricToMatrix(fullData, params);
|
||||
break;
|
||||
default:
|
||||
return reply.code(400).send({ success: false, error: '不支持的操作' });
|
||||
}
|
||||
@@ -361,14 +407,29 @@ export class QuickActionController {
|
||||
const newRows = resultData.length;
|
||||
|
||||
let estimatedChange = '';
|
||||
if (action === 'filter' || action === 'dropna') {
|
||||
estimatedChange = `将保留 ${newRows} 行(删除 ${originalRows - newRows} 行)`;
|
||||
} else if (action === 'recode' || action === 'binning' || action === 'conditional' || action === 'compute') {
|
||||
estimatedChange = `将新增 1 列`;
|
||||
} else if (action === 'pivot') {
|
||||
const originalCols = Object.keys(fullData[0] || {}).length;
|
||||
const newCols = Object.keys(resultData[0] || {}).length;
|
||||
estimatedChange = `行数: ${originalRows} → ${newRows}, 列数: ${originalCols} → ${newCols}`;
|
||||
switch (action) {
|
||||
case 'filter':
|
||||
case 'dropna':
|
||||
estimatedChange = `将保留 ${newRows} 行(删除 ${originalRows - newRows} 行)`;
|
||||
break;
|
||||
case 'recode':
|
||||
case 'binning':
|
||||
case 'conditional':
|
||||
case 'compute':
|
||||
estimatedChange = `将新增 1 列`;
|
||||
break;
|
||||
case 'pivot':
|
||||
case 'unpivot':
|
||||
case 'metric_time':
|
||||
case 'multi_metric_to_long':
|
||||
case 'multi_metric_to_matrix': {
|
||||
const originalCols = Object.keys(fullData[0] || {}).length;
|
||||
const newCols = Object.keys(resultData[0] || {}).length;
|
||||
estimatedChange = `行数: ${originalRows} → ${newRows}, 列数: ${originalCols} → ${newCols}`;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
estimatedChange = `操作完成`;
|
||||
}
|
||||
|
||||
return reply.code(200).send({
|
||||
@@ -541,6 +602,95 @@ export class QuickActionController {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* POST /api/v1/dc/tool-c/metric-time/detect
|
||||
* 检测指标-时间表转换模式
|
||||
*/
|
||||
async handleMetricTimeDetect(request: FastifyRequest, reply: FastifyReply) {
|
||||
try {
|
||||
const { sessionId, valueVars } = request.body as { sessionId: string; valueVars: string[] };
|
||||
|
||||
logger.info(`[QuickAction] 检测指标-时间表模式: session=${sessionId}, ${valueVars?.length || 0} 列`);
|
||||
|
||||
// 验证参数
|
||||
if (!valueVars || valueVars.length < 2) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
error: '至少需要2列才能检测模式'
|
||||
});
|
||||
}
|
||||
|
||||
// 调用Service检测模式
|
||||
const result = await quickActionService.detectMetricTimePattern(valueVars);
|
||||
|
||||
if (!result.success) {
|
||||
return reply.code(500).send({
|
||||
success: false,
|
||||
error: result.error || '模式检测失败'
|
||||
});
|
||||
}
|
||||
|
||||
return reply.code(200).send({
|
||||
success: true,
|
||||
pattern: result.pattern,
|
||||
execution_time: result.execution_time
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickAction] 模式检测失败: ${error.message}`);
|
||||
return reply.code(500).send({
|
||||
success: false,
|
||||
error: error.message
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* POST /api/v1/dc/tool-c/multi-metric/detect
|
||||
* 检测多指标分组
|
||||
*/
|
||||
async handleMultiMetricDetect(request: FastifyRequest, reply: FastifyReply) {
|
||||
try {
|
||||
const { sessionId, valueVars, separators } = request.body as {
|
||||
sessionId: string;
|
||||
valueVars: string[];
|
||||
separators?: string[];
|
||||
};
|
||||
|
||||
logger.info(`[QuickAction] 检测多指标分组: session=${sessionId}, ${valueVars?.length || 0} 列`);
|
||||
|
||||
// 验证参数
|
||||
if (!valueVars || valueVars.length < 2) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
error: '至少需要2列才能检测分组'
|
||||
});
|
||||
}
|
||||
|
||||
// 调用Service检测分组
|
||||
const result = await quickActionService.detectMultiMetricGroups(valueVars, separators);
|
||||
|
||||
if (!result.success) {
|
||||
return reply.code(500).send({
|
||||
success: false,
|
||||
error: result.message || '分组检测失败'
|
||||
});
|
||||
}
|
||||
|
||||
return reply.code(200).send({
|
||||
success: true,
|
||||
grouping: result
|
||||
});
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickAction] 多指标分组检测失败: ${error.message}`);
|
||||
return reply.code(500).send({
|
||||
success: false,
|
||||
error: error.message
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ==================== 导出单例 ====================
|
||||
|
||||
@@ -236,3 +236,9 @@ export const streamAIController = new StreamAIController();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -133,5 +133,19 @@ export async function toolCRoutes(fastify: FastifyInstance) {
|
||||
fastify.post('/fillna/mice', {
|
||||
handler: quickActionController.handleFillnaMice.bind(quickActionController),
|
||||
});
|
||||
|
||||
// ✨ 指标-时间表转换(新增)
|
||||
|
||||
// 检测指标-时间表转换模式
|
||||
fastify.post('/metric-time/detect', {
|
||||
handler: quickActionController.handleMetricTimeDetect.bind(quickActionController),
|
||||
});
|
||||
|
||||
// ✨ 多指标转换(新增)
|
||||
|
||||
// 检测多指标分组
|
||||
fastify.post('/multi-metric/detect', {
|
||||
handler: quickActionController.handleMultiMetricDetect.bind(quickActionController),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -77,6 +77,49 @@ interface PivotParams {
|
||||
unusedAggMethod?: 'first' | 'mode' | 'mean'; // ✨ 新增:未选择列的聚合方式
|
||||
}
|
||||
|
||||
interface UnpivotParams {
|
||||
idVars: string[]; // ID列(保持不变的列)
|
||||
valueVars: string[]; // 值列(需要转换的列)
|
||||
varName: string; // 变量名列名
|
||||
valueName: string; // 值列名
|
||||
parseColumnNames?: boolean; // 是否解析列名
|
||||
separator?: string; // 分隔符
|
||||
metricName?: string; // 指标列名
|
||||
timeName?: string; // 时间列名
|
||||
dropna?: boolean; // 是否删除缺失值行
|
||||
}
|
||||
|
||||
interface MetricTimeParams {
|
||||
idVars: string[]; // ID列(保持不变的列)
|
||||
valueVars: string[]; // 值列(同一指标的多个时间点)
|
||||
metricName?: string; // 指标名称(可选,自动检测)
|
||||
separator?: string; // 分隔符(可选,自动检测)
|
||||
timepointColName?: string; // 时间点列名
|
||||
}
|
||||
|
||||
interface MultiMetricToLongParams {
|
||||
idVars: string[]; // ID列
|
||||
valueVars: string[]; // 值列(多个指标的多个时间点)
|
||||
separators?: string[]; // 可选的分隔符列表
|
||||
eventColName?: string; // 时间点列名(默认 'Event_Name')
|
||||
}
|
||||
|
||||
interface MultiMetricToMatrixParams {
|
||||
idVars: string[]; // ID列
|
||||
valueVars: string[]; // 值列(多个指标的多个时间点)
|
||||
separators?: string[]; // 可选的分隔符列表
|
||||
metricColName?: string; // 指标列名(默认 '指标名')
|
||||
}
|
||||
|
||||
interface MetricGrouping {
|
||||
success: boolean;
|
||||
metric_groups?: Record<string, string[]>; // 指标分组
|
||||
separator?: string; // 检测到的分隔符
|
||||
timepoints?: string[]; // 时间点列表
|
||||
confidence?: number; // 置信度
|
||||
message?: string;
|
||||
}
|
||||
|
||||
interface FillnaSimpleParams {
|
||||
column: string;
|
||||
newColumnName: string;
|
||||
@@ -100,6 +143,7 @@ interface OperationResult {
|
||||
error?: string;
|
||||
message?: string;
|
||||
stats?: any;
|
||||
pattern?: any; // ✨ 新增:用于指标-时间表模式检测
|
||||
}
|
||||
|
||||
// ==================== 服务类 ====================
|
||||
@@ -359,6 +403,209 @@ export class QuickActionService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行Unpivot(宽表转长表)
|
||||
*/
|
||||
async executeUnpivot(data: any[], params: UnpivotParams): Promise<OperationResult> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 调用Unpivot API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/unpivot`, {
|
||||
data,
|
||||
id_vars: params.idVars,
|
||||
value_vars: params.valueVars,
|
||||
var_name: params.varName || '变量',
|
||||
value_name: params.valueName || '值',
|
||||
parse_column_names: params.parseColumnNames || false,
|
||||
separator: params.separator || '_',
|
||||
metric_name: params.metricName,
|
||||
time_name: params.timeName,
|
||||
dropna: params.dropna || false,
|
||||
}, {
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] Unpivot成功: ${response.data.result_shape?.[0] || 0} 行`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] Unpivot失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: error.message || 'Unpivot失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检测指标-时间表转换模式
|
||||
*/
|
||||
async detectMetricTimePattern(valueVars: string[]): Promise<OperationResult> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 检测指标-时间表模式: ${valueVars.length} 列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/metric-time/detect`, {
|
||||
value_vars: valueVars,
|
||||
}, {
|
||||
timeout: 10000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] 模式检测成功`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] 模式检测失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: error.message || '模式检测失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行指标-时间表转换
|
||||
*/
|
||||
async executeMetricTime(data: any[], params: MetricTimeParams): Promise<OperationResult> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 调用指标-时间表转换API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/metric-time`, {
|
||||
data,
|
||||
id_vars: params.idVars,
|
||||
value_vars: params.valueVars,
|
||||
metric_name: params.metricName,
|
||||
separator: params.separator,
|
||||
timepoint_col_name: params.timepointColName || '时间点',
|
||||
}, {
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] 指标-时间表转换成功: ${response.data.result_shape?.[0] || 0} 行`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] 指标-时间表转换失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: error.message || '指标-时间表转换失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检测多指标分组
|
||||
*/
|
||||
async detectMultiMetricGroups(valueVars: string[], separators?: string[]): Promise<MetricGrouping> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 调用多指标分组检测API: ${valueVars.length} 列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/detect`, {
|
||||
value_vars: valueVars,
|
||||
separators: separators,
|
||||
}, {
|
||||
timeout: 10000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] 多指标分组检测成功: ${Object.keys(response.data.metric_groups || {}).length} 个指标`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] 多指标分组检测失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
message: error.message || '多指标分组检测失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行多指标转长表(时间点为行,指标为列)
|
||||
*/
|
||||
async executeMultiMetricToLong(data: any[], params: MultiMetricToLongParams): Promise<OperationResult> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 调用多指标转长表API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/to-long`, {
|
||||
data,
|
||||
id_vars: params.idVars,
|
||||
value_vars: params.valueVars,
|
||||
separators: params.separators,
|
||||
event_col_name: params.eventColName || 'Event_Name',
|
||||
}, {
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] 多指标转长表成功: ${response.data.result_shape?.[0] || 0} 行`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] 多指标转长表失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: error.message || '多指标转长表失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行多指标转矩阵(时间点为列,指标为行)
|
||||
*/
|
||||
async executeMultiMetricToMatrix(data: any[], params: MultiMetricToMatrixParams): Promise<OperationResult> {
|
||||
try {
|
||||
logger.info(`[QuickActionService] 调用多指标转矩阵API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`);
|
||||
|
||||
const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/to-matrix`, {
|
||||
data,
|
||||
id_vars: params.idVars,
|
||||
value_vars: params.valueVars,
|
||||
separators: params.separators,
|
||||
metric_col_name: params.metricColName || '指标名',
|
||||
}, {
|
||||
timeout: 60000,
|
||||
});
|
||||
|
||||
logger.info(`[QuickActionService] 多指标转矩阵成功: ${response.data.result_shape?.[0] || 0} 行`);
|
||||
return response.data;
|
||||
|
||||
} catch (error: any) {
|
||||
logger.error(`[QuickActionService] 多指标转矩阵失败: ${error.message}`);
|
||||
|
||||
if (error.response?.data) {
|
||||
return error.response.data;
|
||||
}
|
||||
|
||||
return {
|
||||
success: false,
|
||||
error: error.message || '多指标转矩阵失败',
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取列的缺失值统计
|
||||
*/
|
||||
@@ -463,3 +710,21 @@ export class QuickActionService {
|
||||
|
||||
export const quickActionService = new QuickActionService();
|
||||
|
||||
// ==================== 导出类型 ====================
|
||||
|
||||
export type {
|
||||
FilterParams,
|
||||
RecodeParams,
|
||||
BinningParams,
|
||||
ConditionalParams,
|
||||
PivotParams,
|
||||
UnpivotParams,
|
||||
MetricTimeParams,
|
||||
MultiMetricToLongParams,
|
||||
MultiMetricToMatrixParams,
|
||||
MetricGrouping,
|
||||
FillnaSimpleParams,
|
||||
FillnaMiceParams,
|
||||
OperationResult,
|
||||
};
|
||||
|
||||
|
||||
@@ -382,3 +382,9 @@ SET session_replication_role = 'origin';
|
||||
**作者:** AI Clinical Research Team
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -84,3 +84,9 @@ WHERE key = 'verify_test';
|
||||
\echo '=========================================='
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -227,3 +227,9 @@ verifyDatabase()
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
6
backend/src/types/global.d.ts
vendored
6
backend/src/types/global.d.ts
vendored
@@ -17,3 +17,9 @@ export {}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -33,6 +33,12 @@ Write-Host "✅ 完成!" -ForegroundColor Green
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -327,3 +327,9 @@ runAdvancedTests().catch(error => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -393,3 +393,9 @@ runAllTests()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -351,3 +351,9 @@ runAllTests()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -135,3 +135,9 @@ Set-Location ..
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# AIclinicalresearch 系统当前状态与开发指南
|
||||
|
||||
> **文档版本:** v1.8
|
||||
> **文档版本:** v1.9
|
||||
> **创建日期:** 2025-11-28
|
||||
> **维护者:** 开发团队
|
||||
> **最后更新:** 2025-12-13
|
||||
> **重大进展:** 🏆 **Postgres-Only 架构改造完成(Phase 1-7)** - Platform层统一任务管理、智能双模式处理、断点续传机制
|
||||
> **最后更新:** 2025-12-21
|
||||
> **重大进展:** ✨ **DC模块多指标转换功能上线(方向1+2)** - 医学研究专用的重复测量数据转换工具
|
||||
> **文档目的:** 快速了解系统当前状态,为新AI助手提供上下文
|
||||
|
||||
---
|
||||
@@ -40,7 +40,7 @@
|
||||
| **AIA** | AI智能问答 | 10+专业智能体(选题评价、PICO梳理等) | ⭐⭐⭐⭐ | ✅ 已完成 | P1 |
|
||||
| **PKB** | 个人知识库 | RAG问答、私人文献库 | ⭐⭐⭐ | ✅ 已完成 | P1 |
|
||||
| **ASL** | AI智能文献 | 文献筛选、Meta分析、证据图谱 | ⭐⭐⭐⭐⭐ | 🚧 **正在开发** | **P0** |
|
||||
| **DC** | 数据清洗整理 | ETL + 医学NER(百万行级数据) | ⭐⭐⭐⭐⭐ | ✅ **Tool B完成 + Tool C 98%(7个功能+NA处理+Pivot优化+UX重大改进)** | **P0** |
|
||||
| **DC** | 数据清洗整理 | ETL + 医学NER(百万行级数据) | ⭐⭐⭐⭐⭐ | ✅ **Tool B完成 + Tool C 99%(7个功能+NA处理+Pivot优化+UX重大改进+多指标转换)** | **P0** |
|
||||
| **SSA** | 智能统计分析 | 队列/预测模型/RCT分析 | ⭐⭐⭐⭐⭐ | 📋 规划中 | P2 |
|
||||
| **ST** | 统计分析工具 | 100+轻量化统计工具 | ⭐⭐⭐⭐ | 📋 规划中 | P2 |
|
||||
| **RVW** | 稿件审查系统 | 方法学评估、审稿流程 | ⭐⭐⭐⭐ | 📋 规划中 | P3 |
|
||||
|
||||
@@ -1250,6 +1250,12 @@ interface FulltextScreeningResult {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -364,6 +364,12 @@ GET /api/v1/asl/fulltext-screening/tasks/:taskId/export
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -307,6 +307,12 @@ Linter错误:0个
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -466,6 +466,12 @@ Failed to open file '\\tmp\\extraction_service\\temp_10000_test.pdf'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# 工具C(Tool C)- 科研数据编辑器 - 当前状态与开发指南
|
||||
|
||||
> **最后更新**: 2025-12-10
|
||||
> **当前版本**: Day 5-8 MVP + 功能按钮 + NA处理 + Pivot优化 + UX重大改进
|
||||
> **开发进度**: Python微服务 ✅ | Session管理 ✅ | AI代码生成 ✅ | 前端完整 ✅ | 通用组件 ✅ | 功能按钮✅(7个)| NA处理✅ | Pivot优化✅ | **UX优化✅(筛选/行号/滚动条/全量数据)**
|
||||
> **最后更新**: 2025-12-21
|
||||
> **当前版本**: Day 5-8 MVP + 功能按钮 + NA处理 + Pivot优化 + UX重大改进 + **多指标转换✅**
|
||||
> **开发进度**: Python微服务 ✅ | Session管理 ✅ | AI代码生成 ✅ | 前端完整 ✅ | 通用组件 ✅ | 功能按钮✅(7个)| NA处理✅ | Pivot优化✅ | UX优化✅ | **多指标转换✅(方向1+2)**
|
||||
|
||||
---
|
||||
|
||||
@@ -21,7 +21,144 @@
|
||||
|
||||
---
|
||||
|
||||
## ✅ 已完成功能(Day 1-8)
|
||||
## ✅ 已完成功能(Day 1-9)
|
||||
|
||||
### 🎉 Day 9 多指标转换功能(2025-12-21)✅
|
||||
|
||||
#### 1. 功能概述
|
||||
**医学研究专用的多指标重复测量数据转换工具**,支持两个转换方向:
|
||||
|
||||
| 转换方向 | 输入格式 | 输出格式 | 适用场景 |
|
||||
|---------|---------|---------|---------|
|
||||
| **方向1:分析格式** | 宽表 | 时间点→行,指标→列 | 统计分析、混合效应模型、GEE、数据可视化 |
|
||||
| **方向2:展示格式** | 宽表 | 时间点→列,指标→行 | 临床报告、数据审查表、CRF核对、单受试者数据审查 |
|
||||
|
||||
#### 2. 核心功能 ✅
|
||||
|
||||
**2.1 智能自动分组** ✅
|
||||
- ✅ 自动检测列名中的指标名称和时间点
|
||||
- ✅ 智能识别分隔符(`___`、`__`、`_`、`-`、`.`等)
|
||||
- ✅ 公共前缀智能扩展(修复"FMA总得分___基线"识别问题)
|
||||
- ✅ 时间点一致性验证
|
||||
- ✅ 置信度评分
|
||||
|
||||
**示例**:
|
||||
```
|
||||
输入列名:FMA总得分___筛选及基线、FMA总得分___随访(2周)、ADL总分___基线、ADL总分___随访(2周)
|
||||
自动检测:
|
||||
✓ 3个指标:FMA总得分、ADL总分、FM疗效
|
||||
✓ 8个时间点:筛选及基线、随访(2周)、随访(1个月)...
|
||||
✓ 分隔符:"___"
|
||||
```
|
||||
|
||||
**2.2 方向1:多指标转长表(时间点为行,指标为列)** ✅
|
||||
- ✅ 适用场景:R/Python统计分析、ggplot2/seaborn可视化、机器学习
|
||||
- ✅ 列顺序优化:`ID列 → Event_Name → 各指标列`
|
||||
- ✅ 保持原始Record ID顺序
|
||||
- ✅ 自动处理缺失值(outer join)
|
||||
|
||||
**示例**:
|
||||
```
|
||||
输入(宽表):
|
||||
Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1
|
||||
4 | 58 | 67 | 40 | 95
|
||||
5 | 61 | 79 | 35 | 85
|
||||
|
||||
输出(长表):
|
||||
Record_ID | Event_Name | FMA总得分 | ADL总分
|
||||
4 | 基线 | 58 | 40
|
||||
4 | 随访1 | 67 | 95
|
||||
5 | 基线 | 61 | 35
|
||||
5 | 随访1 | 79 | 85
|
||||
```
|
||||
|
||||
**2.3 方向2:多指标转矩阵(时间点为列,指标为行)** ✅
|
||||
- ✅ 适用场景:临床报告、数据审查、CRF核对
|
||||
- ✅ 列顺序优化:`ID列 → 指标名列 → 各时间点列`
|
||||
- ✅ 保持原始Record ID顺序
|
||||
- ✅ 时间点列按原始顺序排列
|
||||
|
||||
**示例**:
|
||||
```
|
||||
输入(宽表):
|
||||
Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1
|
||||
4 | 58 | 67 | 40 | 95
|
||||
|
||||
输出(矩阵):
|
||||
Record_ID | 指标名 | 基线 | 随访1
|
||||
4 | FMA总得分 | 58 | 67
|
||||
4 | ADL总分 | 40 | 95
|
||||
```
|
||||
|
||||
#### 3. UX优化 ✅
|
||||
|
||||
| 功能 | 说明 | 状态 |
|
||||
|------|------|------|
|
||||
| 转换方向选择 | Radio组件,两个选项,带场景说明 | ✅ |
|
||||
| 全选/清空按钮 | 快速选择所有值列 | ✅ |
|
||||
| 实时预览 | 选择列后自动生成预览(前10行) | ✅ |
|
||||
| 智能表单 | 根据转换方向动态显示不同的输入框 | ✅ |
|
||||
| 可视化分组结果 | Tag标签展示检测到的指标和时间点 | ✅ |
|
||||
| 置信度提示 | 检测置信度<1.0时显示警告 | ✅ |
|
||||
|
||||
#### 4. 技术架构 ✅
|
||||
|
||||
**4.1 Python层(`metric_time_transform.py`)**
|
||||
- ✅ `detect_metric_groups()` - 自动分组检测(300行)
|
||||
- ✅ `apply_multi_metric_to_long()` - 方向1转换(150行)
|
||||
- ✅ `apply_multi_metric_to_matrix()` - 方向2转换(180行)
|
||||
- ✅ 智能排序:保持原始Record ID顺序
|
||||
|
||||
**4.2 Python API(`main.py`)**
|
||||
- ✅ `POST /api/operations/multi-metric/detect` - 检测指标分组
|
||||
- ✅ `POST /api/operations/multi-metric/to-long` - 执行方向1转换
|
||||
- ✅ `POST /api/operations/multi-metric/to-matrix` - 执行方向2转换
|
||||
|
||||
**4.3 Node.js Backend**
|
||||
- ✅ `QuickActionService.ts` - 3个新方法
|
||||
- ✅ `QuickActionController.ts` - 支持2个新action
|
||||
- ✅ 路由注册:`/multi-metric/detect`
|
||||
|
||||
**4.4 Frontend(`MultiMetricPanel.tsx`)**
|
||||
- ✅ 转换方向选择(Radio组件)
|
||||
- ✅ 智能表单(动态显示)
|
||||
- ✅ 实时检测和预览
|
||||
- ✅ 完整的错误处理
|
||||
|
||||
#### 5. 关键技术突破 ✅
|
||||
|
||||
| 技术点 | 问题 | 解决方案 |
|
||||
|-------|------|---------|
|
||||
| 列名识别 | "FMA总得分___基线" 被错误识别为 "FMA" | 智能修正算法:扩展公共前缀 |
|
||||
| 列顺序 | Event_Name位置随机 | 强制列顺序:ID → Event_Name → 指标 |
|
||||
| Record ID顺序 | 转换后按字典序排序(4,10,11,5,6) | 添加临时列 `_original_order` 保持原始顺序 |
|
||||
| 分隔符识别 | 不支持三重下划线 `___` | 优先级列表:`['___', '__', '_', '-', '.']` |
|
||||
| 时间点提取 | `.lstrip()` 错误移除字符 | 使用 `.startswith()` 精确匹配 |
|
||||
|
||||
#### 6. 测试覆盖 ✅
|
||||
|
||||
| 测试场景 | 测试数据 | 状态 |
|
||||
|---------|---------|------|
|
||||
| 单ID列,多指标 | Record_ID: 4,5,6,10,11 | ✅ |
|
||||
| 三重下划线分隔符 | `FMA总得分___筛选及基线` | ✅ |
|
||||
| 括号时间点 | `随访(2周)` | ✅ |
|
||||
| 中文列名 | `FMA疗效` | ✅ |
|
||||
| 空值处理 | outer join保留所有时间点 | ✅ |
|
||||
| 原始顺序保持 | 4→5→6→10→11 | ✅ |
|
||||
|
||||
#### 7. 代码统计 ✅
|
||||
|
||||
| 文件 | 新增代码 | 说明 |
|
||||
|------|---------|------|
|
||||
| `metric_time_transform.py` | ~600行 | Python核心算法 |
|
||||
| `main.py` | ~150行 | 3个API端点 |
|
||||
| `QuickActionService.ts` | ~100行 | 3个新方法 |
|
||||
| `QuickActionController.ts` | ~50行 | Action支持 |
|
||||
| `MultiMetricPanel.tsx` | ~530行 | 完整UI组件 |
|
||||
| `TransformDialog.tsx` | ~30行 | Tab集成 |
|
||||
| **总计** | **~1460行** | **完整功能实现** |
|
||||
|
||||
---
|
||||
|
||||
### 🚀 Day 7-8 NA处理优化 + Pivot列顺序优化(2025-12-09~10)
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
# DC数据清洗整理模块 - 当前状态与开发指南
|
||||
|
||||
> **文档版本:** v3.2
|
||||
> **文档版本:** v3.3
|
||||
> **创建日期:** 2025-11-28
|
||||
> **维护者:** DC模块开发团队
|
||||
> **最后更新:** 2025-12-13 🏆 **Postgres-Only 架构改造完成!**
|
||||
> **重大里程碑:** Tool C MVP完成 + Tool B Postgres-Only架构改造(智能双模式、任务拆分、断点续传)
|
||||
> **最后更新:** 2025-12-21 ✨ **多指标转换功能上线!**
|
||||
> **重大里程碑:** Tool C MVP完成 + Tool B Postgres-Only架构改造 + **Tool C多指标转换(方向1+2)**
|
||||
> **文档目的:** 反映模块真实状态,记录开发历程
|
||||
|
||||
---
|
||||
@@ -67,17 +67,18 @@ DC数据清洗整理模块提供4个智能工具,帮助研究人员清洗、
|
||||
- ✅ 断点续传支持(支持长时间提取任务)
|
||||
- ✅ Platform层统一管理(job.data存储)
|
||||
- ✅ Worker注册(extractionWorker.ts)
|
||||
- ✅ **Tool C 完整实现**(2025-12-06 ~ 2025-12-10):
|
||||
- ✅ Python微服务(~1800行,Day 1 + NA处理优化 + 全量数据处理)
|
||||
- ✅ Node.js后端(~3500行,Day 2-3,Day 5-8增强 + 全量返回)
|
||||
- ✅ 前端界面(~4000行,Day 4-8,筛选/行号/滚动条/全量加载)
|
||||
- ✅ **Tool C 完整实现**(2025-12-06 ~ 2025-12-21):
|
||||
- ✅ Python微服务(~2400行,Day 1 + NA处理优化 + 全量数据处理 + 多指标转换)
|
||||
- ✅ Node.js后端(~3600行,Day 2-3,Day 5-8增强 + 全量返回 + 多指标转换)
|
||||
- ✅ 前端界面(~4500行,Day 4-8,筛选/行号/滚动条/全量加载 + 多指标转换)
|
||||
- ✅ **通用 Chat 组件**(~968行,Day 5)🎉
|
||||
- ✅ 7个功能按钮(Day 6)
|
||||
- ✅ NA处理优化(4个功能,Day 7)
|
||||
- ✅ Pivot列顺序优化(Day 7-8)
|
||||
- ✅ 计算列方案B(安全列名映射,Day 7-8)
|
||||
- ✅ **UX重大改进**(列头筛选/行号/滚动条修复/全量数据,Day 8)
|
||||
- **总计:~13068行** | **完成度:98%**
|
||||
- ✅ **多指标转换**(方向1+2,智能分组,原始顺序保持,Day 9)
|
||||
- **总计:~14528行** | **完成度:99%**
|
||||
- **重大成就**:
|
||||
- 🎉 **前端通用能力层建设完成**
|
||||
- ✨ 基于 Ant Design X 的 Chat 组件库
|
||||
|
||||
@@ -539,3 +539,9 @@ df['creatinine'] = pd.to_numeric(df['creatinine'], errors='coerce')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -377,3 +377,9 @@ npm run dev
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -954,3 +954,9 @@ export const aiController = new AIController();
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1288,3 +1288,9 @@ npm install react-markdown
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -196,3 +196,9 @@ FMA___基线 | FMA___1个月 | FMA___2个月
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -354,3 +354,9 @@ formula = "FMA总分(0-100) / 100"
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -188,3 +188,9 @@ async handleFillnaMice(request, reply) {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -160,3 +160,9 @@ method: 'mean' | 'median' | 'mode' | 'constant' | 'ffill' | 'bfill'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -307,6 +307,12 @@ Changes:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -382,3 +382,9 @@ cd path; command
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -611,3 +611,9 @@ import { logger } from '../../../../common/logging/index.js';
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -615,3 +615,9 @@ Content-Length: 45234
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -267,3 +267,9 @@ Response:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -420,3 +420,9 @@ Response:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -414,3 +414,9 @@ import { ChatContainer } from '@/shared/components/Chat';
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -324,3 +324,9 @@ const initialMessages = defaultMessages.length > 0 ? defaultMessages : [{
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -364,3 +364,9 @@ python main.py
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -612,3 +612,9 @@ http://localhost:5173/data-cleaning/tool-c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -222,3 +222,9 @@ Day 5 (6-8小时):
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -393,6 +393,12 @@ Docs: docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -372,6 +372,12 @@ const mockAssets: Asset[] = [
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -357,5 +357,11 @@ frontend-v2/src/modules/dc/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -316,6 +316,12 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -271,5 +271,11 @@ ConflictDetectionService // 冲突检测(字段级对比)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -320,5 +320,11 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -283,5 +283,11 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -347,5 +347,11 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -434,6 +434,12 @@ Tool B后端代码**100%复用**了平台通用能力层,无任何重复开发
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -281,5 +281,11 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -211,6 +211,12 @@ $ node scripts/check-dc-tables.mjs
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -445,5 +445,11 @@ ${fields.map((f, i) => `${i + 1}. ${f.name}:${f.desc}`).join('\n')}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
# 部署架构设计
|
||||
|
||||
> **文档版本:** v1.0
|
||||
> **创建日期:** 2025-10-29
|
||||
> **维护者:** 架构团队
|
||||
> **最后更新:** 2025-10-29
|
||||
|
||||
---
|
||||
|
||||
## 📋 文档说明
|
||||
|
||||
本文档描述系统的部署架构设计,包括:
|
||||
- 部署模式(云部署、本地化部署、混合部署)
|
||||
- 部署方案(Docker、Kubernetes等)
|
||||
- 环境配置
|
||||
- 模块独立部署方案
|
||||
|
||||
---
|
||||
|
||||
## ⏳ 待完善
|
||||
|
||||
本文档内容待规划完善,目前仅作为占位文档。
|
||||
|
||||
---
|
||||
|
||||
**文档版本:** v1.0
|
||||
**最后更新:** 2025-10-29
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -854,3 +854,9 @@ ACR镜像仓库:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -168,7 +168,7 @@ RAG 系统迁移的复杂度:
|
||||
专有网络 VPC: 选择 SAE 所在的 VPC
|
||||
安全组: 创建新安全组,配置入方向规则(⚠️ 安全红线):
|
||||
✅ 允许 22/TCP 来源:您的办公室公网IP # SSH管理
|
||||
✅ 允许 80/TCP 来源:172.16.0.0/12 # Nginx(VPC内网访问)
|
||||
✅ 允许 80/TCP 来源:172.17.0.0/16 # Nginx(VPC内网访问)
|
||||
❌ 拒绝 5000/TCP 来源:0.0.0.0/0 # Dify API禁止公网访问
|
||||
❌ 拒绝 6379/TCP 来源:0.0.0.0/0 # Redis禁止公网访问
|
||||
❌ 拒绝 8080/TCP 来源:0.0.0.0/0 # Weaviate禁止公网访问
|
||||
@@ -177,7 +177,7 @@ RAG 系统迁移的复杂度:
|
||||
|
||||
⚠️ 安全警告:
|
||||
- Dify API (5000)、Redis (6379)、Weaviate (8080) 绝对不能对公网开放
|
||||
- 只允许VPC内网访问(172.16.0.0/12)
|
||||
- 只允许VPC内网访问(172.17.0.0/16)
|
||||
- 端口绑定到 127.0.0.1(见docker-compose.yaml配置)
|
||||
```
|
||||
|
||||
|
||||
@@ -540,19 +540,19 @@ docker rm extraction-test
|
||||
```bash
|
||||
# 1. 登录阿里云容器镜像服务
|
||||
# 获取登录命令:阿里云控制台 → 容器镜像服务 → 访问凭证 → 设置Registry登录密码
|
||||
docker login --username=<your-username> registry.cn-hangzhou.aliyuncs.com
|
||||
docker login --username=<your-username> registry.cn-beijing.aliyuncs.com
|
||||
|
||||
# 2. 给镜像打标签
|
||||
docker tag extraction-service:latest \
|
||||
registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
|
||||
# 3. 推送到阿里云
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
|
||||
# 4. 推送 latest 标签(便于后续更新)
|
||||
docker tag extraction-service:latest \
|
||||
registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
```
|
||||
|
||||
---
|
||||
@@ -572,7 +572,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-servi
|
||||
|
||||
3. **镜像配置**:
|
||||
```
|
||||
镜像地址: registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
镜像地址: registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest
|
||||
镜像版本: latest
|
||||
镜像拉取策略: Always(每次部署都拉取最新镜像)
|
||||
```
|
||||
@@ -690,7 +690,7 @@ TZ=Asia/Shanghai
|
||||
3. **查看并复制"内网访问地址"**,通常是以下格式之一:
|
||||
```
|
||||
# 格式 1: 内网 IP + 端口(⭐⭐⭐⭐⭐ 强烈推荐,最稳定)
|
||||
172.16.0.10:8000
|
||||
172.17.x.x:8000
|
||||
|
||||
# 格式 2: SAE 内网 Service 域名(需要额外配置服务发现,不推荐)
|
||||
extraction-service-xxxxx.cn-hangzhou.sae.aliyuncs.com:8000
|
||||
@@ -716,7 +716,7 @@ TZ=Asia/Shanghai
|
||||
5. **✅ 推荐做法(按优先级排序)**:
|
||||
```bash
|
||||
# ⭐⭐⭐⭐⭐ 方案A:直接使用内网IP(强烈推荐)
|
||||
EXTRACTION_SERVICE_URL=http://172.16.0.10:8000
|
||||
EXTRACTION_SERVICE_URL=http://172.17.x.x:8000
|
||||
# 获取方式:SAE控制台 > Python应用 > 实例列表 > 查看内网IP
|
||||
|
||||
# ⭐⭐⭐ 方案B:使用SAE服务发现(需要额外配置,不推荐初期使用)
|
||||
@@ -730,7 +730,7 @@ TZ=Asia/Shanghai
|
||||
|
||||
```bash
|
||||
# ⚠️ 使用 SAE 控制台显示的真实内网地址
|
||||
EXTRACTION_SERVICE_URL=http://172.16.0.10:8000
|
||||
EXTRACTION_SERVICE_URL=http://172.17.x.x:8000
|
||||
|
||||
# 注意:
|
||||
# 1. 不要使用猜测的域名
|
||||
@@ -817,7 +817,7 @@ export async function testExtractionService() {
|
||||
|
||||
2. **查看 Node.js 后端日志**(SAE 控制台 → 后端应用 → 日志):
|
||||
```
|
||||
[INFO] Calling extraction service: http://172.16.0.10:8000/extract/pdf
|
||||
[INFO] Calling extraction service: http://172.17.x.x:8000/extract/pdf
|
||||
[INFO] Extraction completed in 2.3s
|
||||
[INFO] Extracted text preview: "This is a test document..."
|
||||
```
|
||||
@@ -1050,7 +1050,7 @@ pip list --outdated
|
||||
|
||||
# 2. 重建镜像(包含安全更新)
|
||||
docker build -t extraction-service:v1.1 .
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.1
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.1
|
||||
|
||||
# 3. 在 SAE 中更新镜像版本
|
||||
```
|
||||
@@ -1131,7 +1131,7 @@ with open(pdf_path, 'rb') as f:
|
||||
```
|
||||
后端日志:Connection refused
|
||||
或
|
||||
ECONNREFUSED: connect ECONNREFUSED 172.16.0.10:8000
|
||||
ECONNREFUSED: connect ECONNREFUSED 172.17.x.x:8000
|
||||
或
|
||||
Error: getaddrinfo ENOTFOUND extraction-service.internal
|
||||
```
|
||||
@@ -1144,7 +1144,7 @@ Error: getaddrinfo ENOTFOUND extraction-service.internal
|
||||
EXTRACTION_SERVICE_URL=http://extraction-service.internal:8000
|
||||
|
||||
# ✅ 正确配置(SAE 控制台显示的真实地址)
|
||||
EXTRACTION_SERVICE_URL=http://172.16.0.10:8000
|
||||
EXTRACTION_SERVICE_URL=http://172.17.x.x:8000
|
||||
```
|
||||
|
||||
**解决方法**:
|
||||
@@ -1300,7 +1300,7 @@ EXTRACTION_SERVICE_URL=http://extraction-service:8000
|
||||
# ✅ 正确做法:从 SAE 控制台获取真实地址
|
||||
# SAE 控制台 → extraction-service 应用 → 应用访问配置
|
||||
# 复制显示的"VPC 内网访问地址"
|
||||
EXTRACTION_SERVICE_URL=http://172.16.0.10:8000
|
||||
EXTRACTION_SERVICE_URL=http://172.17.x.x:8000
|
||||
```
|
||||
|
||||
**原因**:
|
||||
@@ -1498,7 +1498,7 @@ echo "Done!"
|
||||
docker build -t extraction-service:v1.0 .
|
||||
|
||||
# 推送镜像
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0
|
||||
|
||||
# 查看 SAE 日志
|
||||
# SAE 控制台 → 应用详情 → 日志
|
||||
|
||||
@@ -93,7 +93,7 @@ npm --version
|
||||
- [ ] **RDS PostgreSQL 15** 实例已创建并运行
|
||||
- 数据库名称:`ai_clinical`(或自定义)
|
||||
- 用户名和密码已准备
|
||||
- 内网地址已获取(如 `rm-xxxxx.pg.rds.aliyuncs.com:5432`)
|
||||
- 内网地址已获取(如 `pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432`)
|
||||
- 白名单已配置(允许 SAE VPC 访问)
|
||||
|
||||
- [ ] **阿里云容器镜像服务 ACR** 已开通
|
||||
@@ -104,8 +104,8 @@ npm --version
|
||||
- VPC 和交换机已选择(与 RDS 在同一 VPC)
|
||||
|
||||
- [ ] **依赖服务的内网地址已获取**:
|
||||
- Python 微服务(SAE):`http://172.16.0.10:8000`
|
||||
- Dify 服务(ECS):`http://172.16.0.20:80`
|
||||
- Python 微服务(SAE):`http://172.17.x.x:8000`
|
||||
- Dify 服务(ECS):`http://172.17.x.x:80`
|
||||
|
||||
#### 敏感信息准备
|
||||
|
||||
@@ -113,7 +113,7 @@ npm --version
|
||||
|
||||
```bash
|
||||
# 数据库
|
||||
DATABASE_URL=postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10
|
||||
DATABASE_URL=postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10
|
||||
|
||||
# LLM API Keys(至少配置一个)
|
||||
DEEPSEEK_API_KEY=sk-xxxxx
|
||||
@@ -122,10 +122,10 @@ CLOSEAI_API_KEY=sk-xxxxx
|
||||
|
||||
# Dify
|
||||
DIFY_API_KEY=app-xxxxx
|
||||
DIFY_API_URL=http://172.16.0.20:80/v1
|
||||
DIFY_API_URL=http://172.17.x.x:80/v1
|
||||
|
||||
# 阿里云 OSS
|
||||
OSS_REGION=oss-cn-hangzhou
|
||||
OSS_REGION=oss-cn-beijing
|
||||
OSS_BUCKET=clinical-research-files
|
||||
OSS_ACCESS_KEY_ID=LTAI5t...
|
||||
OSS_ACCESS_KEY_SECRET=xxx...
|
||||
@@ -157,10 +157,10 @@ Node.js 后端(SAE)
|
||||
├──→ RDS PostgreSQL 15(数据库)
|
||||
│
|
||||
├──→ Python 微服务(SAE) - 文档提取
|
||||
│ └─ http://172.16.0.10:8000
|
||||
│ └─ http://172.17.x.x:8000
|
||||
│
|
||||
├──→ Dify 服务(ECS) - RAG 知识库
|
||||
│ └─ http://172.16.0.20:80/v1
|
||||
│ └─ http://172.17.x.x:80/v1
|
||||
│
|
||||
└──→ 阿里云 OSS - 文件存储
|
||||
└─ clinical-research-files
|
||||
@@ -247,7 +247,7 @@ cp backend/.env backend/.env.backup
|
||||
|
||||
# 2. 创建临时 RDS 连接配置
|
||||
cat > backend/.env.rds <<EOF
|
||||
DATABASE_URL="postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10"
|
||||
DATABASE_URL="postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10"
|
||||
EOF
|
||||
|
||||
# 3. 使用 RDS 配置
|
||||
@@ -263,7 +263,7 @@ npx prisma db pull
|
||||
|
||||
# 输出示例:
|
||||
# Prisma schema loaded from prisma/schema.prisma
|
||||
# Datasource "db": PostgreSQL database "ai_clinical" at "rm-xxxxx.pg.rds.aliyuncs.com:5432"
|
||||
# Datasource "db": PostgreSQL database "ai_clinical" at "pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432"
|
||||
#
|
||||
# Introspecting based on datasource defined in prisma/schema.prisma …
|
||||
#
|
||||
@@ -348,7 +348,7 @@ git push
|
||||
|
||||
```bash
|
||||
# 错误信息:
|
||||
Error: P1001: Can't reach database server at `rm-xxxxx.pg.rds.aliyuncs.com:5432`
|
||||
Error: P1001: Can't reach database server at `pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432`
|
||||
```
|
||||
|
||||
**解决方法**:
|
||||
@@ -359,7 +359,7 @@ Error: P1001: Can't reach database server at `rm-xxxxx.pg.rds.aliyuncs.com:5432`
|
||||
# 添加你的本地公网 IP(查询:curl ipinfo.io)
|
||||
|
||||
# 2. 测试连接
|
||||
psql "postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
psql "postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
|
||||
# 如果能连上,再执行 npx prisma db pull
|
||||
```
|
||||
@@ -885,7 +885,7 @@ SERVICE_NAME=backend-service
|
||||
|
||||
```bash
|
||||
# ⚠️ 使用 RDS 内网地址
|
||||
DATABASE_URL=postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10
|
||||
DATABASE_URL=postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10
|
||||
|
||||
# 连接池配置(根据 RDS 规格调整)
|
||||
DB_MAX_CONNECTIONS=400
|
||||
@@ -903,7 +903,7 @@ MAX_INSTANCES=20
|
||||
```bash
|
||||
# 使用阿里云 OSS
|
||||
STORAGE_TYPE=oss
|
||||
OSS_REGION=oss-cn-hangzhou
|
||||
OSS_REGION=oss-cn-beijing
|
||||
OSS_BUCKET=clinical-research-files
|
||||
OSS_ACCESS_KEY_ID=LTAI5t...
|
||||
OSS_ACCESS_KEY_SECRET=xxx...
|
||||
@@ -939,7 +939,7 @@ CLOSEAI_CLAUDE_BASE_URL=https://api.openai-proxy.org/anthropic
|
||||
|
||||
```bash
|
||||
# ⚠️ 使用 ECS 内网 IP(不要使用公网域名)
|
||||
DIFY_API_URL=http://172.16.0.20:80/v1
|
||||
DIFY_API_URL=http://172.17.x.x:80/v1
|
||||
DIFY_API_KEY=app-xxxxx
|
||||
```
|
||||
|
||||
@@ -1079,7 +1079,7 @@ ls -lh ai_clinical_backup_*.sql
|
||||
|
||||
```bash
|
||||
# 连接到 RDS 并导入
|
||||
psql "postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical" \
|
||||
psql "postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical" \
|
||||
< ai_clinical_backup_20251213.sql
|
||||
|
||||
# 输出示例:
|
||||
@@ -1101,7 +1101,7 @@ psql "postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinic
|
||||
|
||||
```bash
|
||||
# 连接到 RDS
|
||||
psql "postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
psql "postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
|
||||
# 查看所有 Schema
|
||||
\dn
|
||||
@@ -1160,7 +1160,7 @@ ls -l prisma/schema.prisma
|
||||
|
||||
```bash
|
||||
# 1. 连接到 RDS
|
||||
export DATABASE_URL="postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
export DATABASE_URL="postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
|
||||
# 2. 执行迁移
|
||||
cd backend
|
||||
@@ -1319,7 +1319,7 @@ curl -X POST https://backend-service-xxxxx.cn-hangzhou.sae.aliyuncs.com/api/asl/
|
||||
|
||||
# 查看后端日志(SAE 控制台 → 应用详情 → 日志)
|
||||
# 应该看到:
|
||||
# [INFO] Calling extraction service: http://172.16.0.10:8000/extract/pdf
|
||||
# [INFO] Calling extraction service: http://172.17.x.x:8000/extract/pdf
|
||||
# [INFO] Extraction completed in 3.2s
|
||||
```
|
||||
|
||||
@@ -1337,7 +1337,7 @@ curl -X POST https://backend-service-xxxxx.cn-hangzhou.sae.aliyuncs.com/api/know
|
||||
|
||||
# 查看后端日志
|
||||
# 应该看到:
|
||||
# [INFO] Calling Dify API: http://172.16.0.20:80/v1/chat-messages
|
||||
# [INFO] Calling Dify API: http://172.17.x.x:80/v1/chat-messages
|
||||
# [INFO] Dify response received in 2.5s
|
||||
```
|
||||
|
||||
@@ -1512,14 +1512,14 @@ SAE 控制台显示:实例启动中 → 健康检查失败 → 实例停止
|
||||
# 解决:检查 SAE 环境变量配置,补充缺失的变量
|
||||
|
||||
# 错误 B:数据库连接失败
|
||||
# 日志:❌ 数据库连接失败: getaddrinfo ENOTFOUND rm-xxxxx.pg.rds.aliyuncs.com
|
||||
# 日志:❌ 数据库连接失败: getaddrinfo ENOTFOUND pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com
|
||||
# 解决:
|
||||
# - 检查 DATABASE_URL 是否正确
|
||||
# - 检查 RDS 白名单是否允许 SAE VPC 访问
|
||||
# - 检查 RDS 内网地址是否可达
|
||||
|
||||
# 错误 C:Prisma 迁移未执行
|
||||
# 日志:Error: P1001: Can't reach database server at `rm-xxxxx.pg.rds.aliyuncs.com`
|
||||
# 日志:Error: P1001: Can't reach database server at `pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com`
|
||||
# 解决:先执行数据库迁移(参见第 8 节)
|
||||
|
||||
# 错误 D:端口冲突
|
||||
@@ -1607,7 +1607,7 @@ DATABASE_URL=postgresql://...?connection_limit=10&pool_timeout=10
|
||||
```
|
||||
[ERROR] Failed to connect to Python service: ECONNREFUSED
|
||||
或
|
||||
[ERROR] connect ETIMEDOUT 172.16.0.10:8000
|
||||
[ERROR] connect ETIMEDOUT 172.17.x.x:8000
|
||||
```
|
||||
|
||||
**排查步骤**:
|
||||
@@ -1622,7 +1622,7 @@ DATABASE_URL=postgresql://...?connection_limit=10&pool_timeout=10
|
||||
|
||||
# 3. 测试内网连通性
|
||||
# 在后端应用的 Webshell 中执行:
|
||||
curl -v http://172.16.0.10:8000/health
|
||||
curl -v http://172.17.x.x:8000/health
|
||||
|
||||
# 4. 检查安全组规则
|
||||
# SAE 控制台 → extraction-service 应用 → 网络配置
|
||||
@@ -1666,7 +1666,7 @@ curl http://localhost/v1/info
|
||||
|
||||
# 3. 从 SAE 测试连通性
|
||||
# 在后端应用的 Webshell 中执行:
|
||||
curl -v http://172.16.0.20:80/v1/info
|
||||
curl -v http://172.17.x.x:80/v1/info
|
||||
```
|
||||
|
||||
**解决方法**:
|
||||
@@ -1700,7 +1700,7 @@ DIFY_API_URL=http://<ECS内网IP>:80/v1
|
||||
# 1. 检查环境变量
|
||||
# SAE 控制台 → 应用详情 → 环境变量
|
||||
# 确认以下变量正确:
|
||||
OSS_REGION=oss-cn-hangzhou
|
||||
OSS_REGION=oss-cn-beijing
|
||||
OSS_BUCKET=clinical-research-files
|
||||
OSS_ACCESS_KEY_ID=LTAI5t...
|
||||
OSS_ACCESS_KEY_SECRET=xxx...
|
||||
@@ -1825,7 +1825,7 @@ PrismaClientKnownRequestError: column "phone" does not exist
|
||||
|
||||
```bash
|
||||
# 1. 在本地开发环境,连接到 RDS
|
||||
export DATABASE_URL="postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
export DATABASE_URL="postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical"
|
||||
|
||||
# 2. 反向同步 Schema
|
||||
npx prisma db pull
|
||||
@@ -2006,7 +2006,7 @@ pg_dump → 导入 RDS → prisma db pull(同步)→ 构建镜像 → 部署
|
||||
|
||||
```typescript
|
||||
// ❌ 错误示例
|
||||
const dbUrl = 'postgresql://admin:P@ssw0rd@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical';
|
||||
const dbUrl = 'postgresql://admin:P@ssw0rd@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical';
|
||||
|
||||
// ✅ 正确做法
|
||||
const dbUrl = process.env.DATABASE_URL;
|
||||
|
||||
@@ -106,7 +106,7 @@ npm run build
|
||||
#### 阿里云资源
|
||||
|
||||
- [ ] **后端服务(SAE)** 已部署并运行
|
||||
- 后端 VPC 内网地址已获取(如 `http://172.16.0.30:3001`)
|
||||
- 后端 VPC 内网地址已获取(如 `http://172.17.x.x:3001`)
|
||||
- 后端健康检查可访问
|
||||
|
||||
- [ ] **阿里云容器镜像服务 ACR** 已开通
|
||||
@@ -120,7 +120,7 @@ npm run build
|
||||
|
||||
```bash
|
||||
# 后端服务内网地址(关键)
|
||||
BACKEND_SERVICE_URL=http://172.16.0.30:3001
|
||||
BACKEND_SERVICE_URL=http://172.17.x.x:3001
|
||||
|
||||
# 如果需要配置环境变量(可选)
|
||||
# VITE_API_BASE_URL 在构建时注入(很少使用)
|
||||
@@ -217,7 +217,7 @@ ASL 模块:GET /api/v1/asl/projects
|
||||
↓
|
||||
Nginx 反向代理
|
||||
↓
|
||||
后端服务:http://172.16.0.30:3001/api/v1/asl/projects
|
||||
后端服务:http://172.17.x.x:3001/api/v1/asl/projects
|
||||
```
|
||||
|
||||
### 📝 构建流程
|
||||
@@ -363,7 +363,7 @@ http {
|
||||
server ${BACKEND_SERVICE_HOST}:${BACKEND_SERVICE_PORT} fail_timeout=30s max_fails=3;
|
||||
|
||||
# 如果有多个后端实例(负载均衡)
|
||||
# server 172.16.0.30:3001 weight=1;
|
||||
# server 172.17.x.x:3001 weight=1;
|
||||
# server 172.16.0.31:3001 weight=1;
|
||||
|
||||
keepalive 32; # 保持连接池
|
||||
@@ -485,7 +485,7 @@ http {
|
||||
access_log off;
|
||||
# 仅允许内网访问
|
||||
allow 10.0.0.0/8;
|
||||
allow 172.16.0.0/12;
|
||||
allow 172.17.0.0/16;
|
||||
allow 192.168.0.0/16;
|
||||
deny all;
|
||||
}
|
||||
@@ -547,7 +547,7 @@ Nginx:接收请求
|
||||
↓
|
||||
Nginx:proxy_pass http://backend
|
||||
↓
|
||||
后端服务:http://172.16.0.30:3001/api/v1/projects
|
||||
后端服务:http://172.17.x.x:3001/api/v1/projects
|
||||
↓
|
||||
后端返回数据
|
||||
↓
|
||||
@@ -806,7 +806,7 @@ nginx.conf.template(模板):
|
||||
↓ envsubst 替换
|
||||
|
||||
nginx.conf(最终配置):
|
||||
server 172.16.0.30:3001;
|
||||
server 172.17.x.x:3001;
|
||||
```
|
||||
|
||||
#### 3. 健康检查
|
||||
@@ -987,7 +987,7 @@ docker rm frontend-test
|
||||
|
||||
```bash
|
||||
# 登录(使用 ACR 密码,不是阿里云账号密码)
|
||||
docker login --username=your-aliyun-account registry.cn-hangzhou.aliyuncs.com
|
||||
docker login --username=your-aliyun-account registry.cn-beijing.aliyuncs.com
|
||||
|
||||
# 输入密码后看到:
|
||||
# Login Succeeded
|
||||
@@ -998,21 +998,21 @@ docker login --username=your-aliyun-account registry.cn-hangzhou.aliyuncs.com
|
||||
```bash
|
||||
# 格式:registry地址/命名空间/仓库名:版本号
|
||||
docker tag frontend-service:v1.0.0 \
|
||||
registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.0
|
||||
registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.0
|
||||
|
||||
# 同时打一个 latest 标签
|
||||
docker tag frontend-service:v1.0.0 \
|
||||
registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:latest
|
||||
registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:latest
|
||||
```
|
||||
|
||||
### 步骤 3:推送镜像
|
||||
|
||||
```bash
|
||||
# 推送指定版本
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.0
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.0
|
||||
|
||||
# 推送 latest
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:latest
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:latest
|
||||
|
||||
# 推送过程需要 1-3 分钟(镜像很小)
|
||||
```
|
||||
@@ -1055,7 +1055,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service
|
||||
| 配置项 | 值 |
|
||||
|-------|-----|
|
||||
| **镜像类型** | 容器镜像服务企业版实例 |
|
||||
| **镜像仓库** | `registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service` |
|
||||
| **镜像仓库** | `registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service` |
|
||||
| **镜像版本** | `v1.0.0` |
|
||||
| **镜像拉取策略** | 总是拉取镜像 |
|
||||
|
||||
@@ -1085,7 +1085,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service
|
||||
|
||||
```bash
|
||||
# ⚠️ 必须配置(否则容器启动失败)
|
||||
BACKEND_SERVICE_HOST=172.16.0.30
|
||||
BACKEND_SERVICE_HOST=172.17.x.x
|
||||
|
||||
# 可选配置(默认 3001)
|
||||
BACKEND_SERVICE_PORT=3001
|
||||
@@ -1108,7 +1108,7 @@ upstream backend {
|
||||
server ${BACKEND_SERVICE_URL}; # ❌ 无法解析 http://172.16.0.30:3001
|
||||
|
||||
# 拆分后:
|
||||
server 172.16.0.30:3001; # ✅ 正确
|
||||
server 172.17.x.x:3001; # ✅ 正确
|
||||
```
|
||||
|
||||
### 步骤 5:配置健康检查
|
||||
@@ -1337,7 +1337,7 @@ API 代理:响应时间 50-500ms(取决于后端)
|
||||
# ✅ 正常启动
|
||||
============================================
|
||||
Starting Frontend Nginx Service
|
||||
Backend Service: 172.16.0.30:3001
|
||||
Backend Service: 172.17.x.x:3001
|
||||
============================================
|
||||
nginx: configuration file /etc/nginx/nginx.conf test is successful
|
||||
|
||||
@@ -1354,7 +1354,7 @@ nginx: configuration file /etc/nginx/nginx.conf test is successful
|
||||
# ❌ 错误日志(后端连接失败)
|
||||
2025/12/13 10:30:04 [error] 7#7: *1 connect() failed (111: Connection refused) while connecting to upstream
|
||||
client: 172.31.0.10, server: _, request: "GET /api/v1/projects HTTP/1.1"
|
||||
upstream: "http://172.16.0.30:3001/api/v1/projects"
|
||||
upstream: "http://172.17.x.x:3001/api/v1/projects"
|
||||
```
|
||||
|
||||
#### 3. Nginx 状态监控
|
||||
@@ -1419,7 +1419,7 @@ curl http://localhost/nginx_status
|
||||
cd frontend
|
||||
npm run build
|
||||
docker build -t frontend-service:v1.0.1 .
|
||||
docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.1
|
||||
docker push registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.1
|
||||
|
||||
# 2. 在 SAE 中更新镜像
|
||||
# SAE 控制台 → 应用详情 → 部署
|
||||
@@ -1556,7 +1556,7 @@ location / {
|
||||
|
||||
# 2. 测试后端内网地址是否可达
|
||||
# 登录前端应用的 Webshell:
|
||||
curl http://172.16.0.30:3001/api/v1/health
|
||||
curl http://172.17.x.x:3001/api/v1/health
|
||||
|
||||
# 如果返回错误,说明:
|
||||
# - 后端服务未启动
|
||||
@@ -1567,7 +1567,7 @@ curl http://172.16.0.30:3001/api/v1/health
|
||||
cat /etc/nginx/nginx.conf | grep -A 5 "upstream backend"
|
||||
|
||||
# 应该看到正确的后端地址:
|
||||
# server 172.16.0.30:3001 fail_timeout=30s max_fails=3;
|
||||
# server 172.17.x.x:3001 fail_timeout=30s max_fails=3;
|
||||
|
||||
# 4. 查看 Nginx 错误日志
|
||||
tail -f /var/log/nginx/error.log | grep "upstream"
|
||||
@@ -1579,14 +1579,14 @@ tail -f /var/log/nginx/error.log | grep "upstream"
|
||||
# 方法 1:更新环境变量
|
||||
# SAE 控制台 → frontend-service → 应用配置 → 环境变量
|
||||
# 确认:
|
||||
BACKEND_SERVICE_HOST=172.16.0.30 # 正确的内网 IP
|
||||
BACKEND_SERVICE_HOST=172.17.x.x # 正确的内网 IP
|
||||
BACKEND_SERVICE_PORT=3001
|
||||
|
||||
# 重启应用使环境变量生效
|
||||
|
||||
# 方法 2:测试内网连通性
|
||||
# 在前端 Webshell 中:
|
||||
telnet 172.16.0.30 3001
|
||||
telnet 172.17.x.x 3001
|
||||
# 如果连接失败,检查:
|
||||
# - 后端和前端是否在同一 VPC
|
||||
# - 安全组规则是否允许访问
|
||||
@@ -1744,7 +1744,7 @@ cat /docker-entrypoint.sh | grep "envsubst"
|
||||
cat /etc/nginx/nginx.conf | grep "server.*backend"
|
||||
|
||||
# 应该看到:
|
||||
# server 172.16.0.30:3001;
|
||||
# server 172.17.x.x:3001;
|
||||
|
||||
# 如果看到:
|
||||
# server ${BACKEND_SERVICE_HOST}:${BACKEND_SERVICE_PORT}; # ❌ 未替换
|
||||
@@ -1809,11 +1809,11 @@ export default defineConfig({
|
||||
|
||||
```bash
|
||||
# ✅ 正确做法:拆分 Host 和 Port
|
||||
BACKEND_SERVICE_HOST=172.16.0.30
|
||||
BACKEND_SERVICE_HOST=172.17.x.x
|
||||
BACKEND_SERVICE_PORT=3001
|
||||
|
||||
# ❌ 错误做法:完整 URL
|
||||
BACKEND_SERVICE_URL=http://172.16.0.30:3001
|
||||
BACKEND_SERVICE_URL=http://172.17.x.x:3001
|
||||
# Nginx 无法解析协议前缀
|
||||
```
|
||||
|
||||
|
||||
@@ -465,3 +465,9 @@ NAT网关成本¥100/月,对初创团队是一笔开销
|
||||
**审查依据:** 专业技术团队反馈
|
||||
**修正质量:** ⭐⭐⭐⭐⭐(8/8问题已全部修正)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -370,3 +370,9 @@ curl http://你的SAE地址:3001/health
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -702,3 +702,9 @@ const job = await queue.getJob(jobId);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -469,3 +469,9 @@ processLiteraturesInBackground(task.id, projectId, testLiteratures);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -946,3 +946,9 @@ ROI = (¥22,556 - ¥144) / ¥144 × 100% = 15,564%
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1003,3 +1003,9 @@ Redis 实例:¥500/月
|
||||
**下次更新:** Phase 8 完成后
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -461,3 +461,9 @@ import { ChatContainer } from '@/shared/components/Chat';
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -70,6 +70,17 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin
|
||||
from operations.dropna import drop_missing_values, get_missing_summary
|
||||
from operations.compute import compute_column, get_formula_examples
|
||||
from operations.pivot import pivot_long_to_wide, get_pivot_preview
|
||||
from operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表
|
||||
from operations.metric_time_transform import (
|
||||
apply_metric_time_transform,
|
||||
detect_common_pattern,
|
||||
preview_metric_time_transform,
|
||||
detect_metric_groups, # ✨ 多指标自动分组
|
||||
apply_multi_metric_to_long, # ✨ 多指标转长表(方向1)
|
||||
preview_multi_metric_to_long, # ✨ 多指标转换预览(方向1)
|
||||
apply_multi_metric_to_matrix, # ✨ 多指标转矩阵(方向2)
|
||||
preview_multi_metric_to_matrix # ✨ 多指标转换预览(方向2)
|
||||
)
|
||||
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
|
||||
|
||||
|
||||
@@ -149,6 +160,59 @@ class PivotRequest(BaseModel):
|
||||
pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序
|
||||
|
||||
|
||||
class UnpivotRequest(BaseModel):
|
||||
"""Unpivot请求模型(宽表转长表)"""
|
||||
data: List[Dict[str, Any]]
|
||||
id_vars: List[str] # ID列(保持不变的列)
|
||||
value_vars: List[str] # 值列(需要转换的列)
|
||||
var_name: str = '变量' # 变量名列名
|
||||
value_name: str = '值' # 值列名
|
||||
parse_column_names: bool = False # 是否解析列名
|
||||
separator: str = '_' # 分隔符
|
||||
metric_name: Optional[str] = None # 指标列名
|
||||
time_name: Optional[str] = None # 时间列名
|
||||
dropna: bool = False # 是否删除缺失值行
|
||||
|
||||
|
||||
class MetricTimeTransformRequest(BaseModel):
|
||||
"""指标-时间表转换请求模型"""
|
||||
data: List[Dict[str, Any]]
|
||||
id_vars: List[str] # ID列(保持不变的列)
|
||||
value_vars: List[str] # 值列(同一指标的多个时间点)
|
||||
metric_name: Optional[str] = None # 指标名称(如果为None,则自动检测)
|
||||
separator: Optional[str] = None # 分隔符(如果为None,则自动检测)
|
||||
timepoint_col_name: str = '时间点' # 时间点列名
|
||||
|
||||
|
||||
class MetricTimeDetectRequest(BaseModel):
|
||||
"""指标-时间表模式检测请求模型"""
|
||||
value_vars: List[str] # 值列(用于检测模式)
|
||||
|
||||
|
||||
class MultiMetricDetectRequest(BaseModel):
|
||||
"""多指标分组检测请求模型"""
|
||||
value_vars: List[str] # 值列(用于检测分组)
|
||||
separators: Optional[List[str]] = None # 可选的分隔符列表
|
||||
|
||||
|
||||
class MultiMetricToLongRequest(BaseModel):
|
||||
"""多指标转长表请求模型(方向1)"""
|
||||
data: List[Dict[str, Any]]
|
||||
id_vars: List[str] # ID列
|
||||
value_vars: List[str] # 值列(多个指标的多个时间点)
|
||||
separators: Optional[List[str]] = None # 可选的分隔符列表
|
||||
event_col_name: str = 'Event_Name' # 时间点列名
|
||||
|
||||
|
||||
class MultiMetricToMatrixRequest(BaseModel):
|
||||
"""多指标转矩阵请求模型(方向2)"""
|
||||
data: List[Dict[str, Any]]
|
||||
id_vars: List[str] # ID列
|
||||
value_vars: List[str] # 值列(多个指标的多个时间点)
|
||||
separators: Optional[List[str]] = None # 可选的分隔符列表
|
||||
metric_col_name: str = '指标名' # 指标列名
|
||||
|
||||
|
||||
class FillnaStatsRequest(BaseModel):
|
||||
"""获取列缺失值统计请求模型"""
|
||||
data: List[Dict[str, Any]]
|
||||
@@ -1292,6 +1356,515 @@ async def operation_pivot(request: PivotRequest):
|
||||
}, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/unpivot")
|
||||
async def operation_unpivot(request: UnpivotRequest):
|
||||
"""
|
||||
Unpivot操作:宽表转长表(预写函数)
|
||||
|
||||
将横向数据转为纵向重复数据
|
||||
|
||||
典型医学场景:
|
||||
- 多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列)
|
||||
- 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列)
|
||||
|
||||
Args:
|
||||
request: UnpivotRequest
|
||||
- data: 数据
|
||||
- id_vars: ID列(保持不变的列)
|
||||
- value_vars: 值列(需要转换的列)
|
||||
- var_name: 变量名列名(默认:"变量")
|
||||
- value_name: 值列名(默认:"值")
|
||||
- parse_column_names: 是否解析列名(默认:False)
|
||||
- separator: 分隔符(默认:"_")
|
||||
- metric_name: 指标列名(可选)
|
||||
- time_name: 时间列名(可选)
|
||||
- dropna: 是否删除缺失值行(默认:False)
|
||||
|
||||
Returns:
|
||||
{
|
||||
"success": bool,
|
||||
"result_data": List[Dict],
|
||||
"output": str,
|
||||
"execution_time": float,
|
||||
"result_shape": [rows, cols]
|
||||
}
|
||||
"""
|
||||
try:
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import time
|
||||
import io
|
||||
import sys
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# 捕获打印输出
|
||||
captured_output = io.StringIO()
|
||||
sys.stdout = captured_output
|
||||
|
||||
try:
|
||||
# 转换为DataFrame
|
||||
df = pd.DataFrame(request.data)
|
||||
|
||||
# ✨ 调用预写函数
|
||||
result_df = apply_unpivot(
|
||||
df,
|
||||
request.id_vars,
|
||||
request.value_vars,
|
||||
request.var_name,
|
||||
request.value_name,
|
||||
request.parse_column_names,
|
||||
request.separator,
|
||||
request.metric_name,
|
||||
request.time_name,
|
||||
request.dropna
|
||||
)
|
||||
|
||||
# 转换回JSON(处理NaN和inf值)
|
||||
result_df = result_df.replace([np.inf, -np.inf], None)
|
||||
result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
|
||||
result_data = result_df_clean.to_dict('records')
|
||||
|
||||
# 恢复stdout
|
||||
sys.stdout = sys.__stdout__
|
||||
output = captured_output.getvalue()
|
||||
|
||||
execution_time = time.time() - start_time
|
||||
|
||||
logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)} 行")
|
||||
|
||||
return JSONResponse(content={
|
||||
"success": True,
|
||||
"result_data": result_data,
|
||||
"output": output,
|
||||
"execution_time": execution_time,
|
||||
"result_shape": [len(result_data), len(result_df.columns)]
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
sys.stdout = sys.__stdout__
|
||||
output = captured_output.getvalue()
|
||||
raise e
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Unpivot操作失败: {str(e)}")
|
||||
return JSONResponse(content={
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"execution_time": time.time() - start_time if 'start_time' in locals() else 0
|
||||
}, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/metric-time/detect")
async def operation_metric_time_detect(request: MetricTimeDetectRequest):
    """
    Detect the column-name pattern for a metric-time table transform.

    Analyzes the given column names and reports:
    - the common prefix (the metric name)
    - the separator between metric and timepoint
    - the list of timepoints
    - a confidence score

    Args:
        request: MetricTimeDetectRequest
            - value_vars: list of value-column names to analyze

    Returns:
        JSON body:
        {
            "success": bool,
            "pattern": {
                "common_prefix": str,
                "separator": str,
                "timepoints": List[str],
                "confidence": float,
                "message": str
            },
            "execution_time": float
        }
        On unexpected error: {"success": False, "error": str, ...} with HTTP 400.
    """
    try:
        import time

        start_time = time.time()

        logger.info(f"检测指标-时间表模式: {len(request.value_vars)} 列")

        # Delegate the actual analysis to the prewritten detection helper.
        pattern = detect_common_pattern(request.value_vars)

        execution_time = time.time() - start_time

        logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}")

        # "success" mirrors the detector's verdict; a failed or low-confidence
        # detection is still returned with HTTP 200 so the client can inspect it.
        return JSONResponse(content={
            "success": pattern['success'],
            "pattern": pattern,
            "execution_time": execution_time
        })

    except Exception as e:
        logger.error(f"模式检测失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may not exist if the failure happened very early.
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/metric-time")
async def operation_metric_time_transform(request: MetricTimeTransformRequest):
    """
    Metric-time table transform endpoint (prewritten function).

    Converts several timepoint columns of one metric into a
    "metric row + one column per timepoint" layout.

    Typical uses:
    - building Table 1 for clinical studies
    - side-by-side comparison of one metric over time

    Args:
        request: MetricTimeTransformRequest
            - data: row records
            - id_vars: ID columns (kept unchanged)
            - value_vars: value columns (same metric, several timepoints)
            - metric_name: metric name (optional; auto-detected when omitted)
            - separator: separator (optional; auto-detected when omitted)
            - timepoint_col_name: name of the generated timepoint column

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "output": str,           # captured stdout of the transform
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        On failure: {"success": False, "error": str, ...} with HTTP 400.
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys

        start_time = time.time()

        # Redirect stdout so the transform's progress prints can be
        # returned to the client in the "output" field.
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            # Build a DataFrame from the posted records.
            df = pd.DataFrame(request.data)

            # ✨ Delegate to the prewritten transform function.
            result_df = apply_metric_time_transform(
                df,
                request.id_vars,
                request.value_vars,
                request.metric_name,
                request.separator,
                request.timepoint_col_name
            )

            # Convert back to JSON-safe records (NaN/inf are not valid JSON).
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

            # Restore stdout before reading what was captured.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)} 列")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Always restore stdout before propagating to the outer handler.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"指标-时间表转换失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be missing if the failure happened before timing began.
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== 多指标转换API ====================
|
||||
|
||||
@app.post("/api/operations/multi-metric/detect")
async def operation_multi_metric_detect(request: MultiMetricDetectRequest):
    """
    Auto-group columns that belong to several metrics.

    Args:
        request: MultiMetricDetectRequest
            - value_vars: value-column names
            - separators: optional list of candidate separators

    Returns:
        {
            "success": bool,
            "metric_groups": Dict[str, List[str]],   # columns grouped by metric
            "separator": str,                        # detected separator
            "timepoints": List[str],                 # timepoint labels
            "confidence": float,                     # 0.0-1.0
            "message": str
        }
        On unexpected error: {"success": False, "error": str} with HTTP 400.
    """
    try:
        grouping = detect_metric_groups(request.value_vars, request.separators)

        column_count = len(request.value_vars)
        metric_count = len(grouping.get('metric_groups', {}))
        logger.info(f"多指标分组检测: {column_count} 列 → {metric_count} 个指标")

        return JSONResponse(content=grouping)

    except Exception as exc:
        logger.error(f"多指标分组检测失败: {str(exc)}")
        error_body = {
            "success": False,
            "error": str(exc)
        }
        return JSONResponse(content=error_body, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/multi-metric/to-long")
async def operation_multi_metric_to_long(request: MultiMetricToLongRequest):
    """
    Multi-metric long-format transform (timepoints as rows, metrics as columns).

    Converts a wide table holding several metrics across several timepoints
    into long format, suited to statistical analysis and visualization.

    Typical uses:
    - longitudinal study analysis
    - repeated-measures data preparation
    - mixed-effects models / GEE
    - plotting (ggplot2, seaborn)

    Args:
        request: MultiMetricToLongRequest
            - data: row records
            - id_vars: ID columns
            - value_vars: value columns (several metrics × several timepoints)
            - separators: optional list of candidate separators
            - event_col_name: name of the generated timepoint column

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "grouping": {...},        # metric grouping info
            "output": str,            # captured stdout of the transform
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        On failure: {"success": False, "error": str, ...} with HTTP 400.
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys

        start_time = time.time()

        # Capture the transform's progress prints for the response "output".
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            df = pd.DataFrame(request.data)

            # 1. Detect how the value columns group into metrics.
            grouping = detect_metric_groups(
                request.value_vars,
                request.separators
            )

            if not grouping['success']:
                # Restore stdout before building the error response.
                sys.stdout = sys.__stdout__
                output = captured_output.getvalue()
                return JSONResponse(content={
                    "success": False,
                    "error": grouping['message'],
                    "output": output
                }, status_code=400)

            # 2. Run the wide-to-long transform.
            result_df = apply_multi_metric_to_long(
                df,
                request.id_vars,
                grouping['metric_groups'],
                grouping['separator'],
                request.event_col_name
            )

            # JSON cannot carry NaN/inf; map them to null.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

            # Restore stdout before reading what was captured.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "grouping": grouping,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before handing the error to the outer handler.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"多指标转长表失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be missing if the failure happened before timing began.
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/multi-metric/to-matrix")
async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest):
    """
    Multi-metric matrix transform (timepoints as columns, metrics as rows).

    Converts a wide table holding several metrics into a matrix layout
    suited to clinical reports and data review.

    Typical uses:
    - clinical study reports
    - data review tables
    - CRF reconciliation
    - single-subject data review

    Args:
        request: MultiMetricToMatrixRequest
            - data: row records
            - id_vars: ID columns
            - value_vars: value columns (several metrics × several timepoints)
            - separators: optional list of candidate separators
            - metric_col_name: name of the generated metric column

    Returns:
        {
            "success": bool,
            "result_data": List[Dict],
            "grouping": {...},        # metric grouping info
            "output": str,            # captured stdout of the transform
            "execution_time": float,
            "result_shape": [rows, cols]
        }
        On failure: {"success": False, "error": str, ...} with HTTP 400.
    """
    try:
        import pandas as pd
        import numpy as np
        import time
        import io
        import sys

        start_time = time.time()

        # Capture the transform's progress prints for the response "output".
        captured_output = io.StringIO()
        sys.stdout = captured_output

        try:
            df = pd.DataFrame(request.data)

            # 1. Detect how the value columns group into metrics.
            grouping = detect_metric_groups(
                request.value_vars,
                request.separators
            )

            if not grouping['success']:
                # Restore stdout before building the error response.
                sys.stdout = sys.__stdout__
                output = captured_output.getvalue()
                return JSONResponse(content={
                    "success": False,
                    "error": grouping['message'],
                    "output": output
                }, status_code=400)

            # 2. Run the transform. 'Event_Name' is the intermediate
            #    timepoint column name used while pivoting.
            result_df = apply_multi_metric_to_matrix(
                df,
                request.id_vars,
                grouping['metric_groups'],
                grouping['separator'],
                'Event_Name',
                request.metric_col_name
            )

            # JSON cannot carry NaN/inf; map them to null.
            result_df = result_df.replace([np.inf, -np.inf], None)
            result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None})
            result_data = result_df_clean.to_dict('records')

            # Restore stdout before reading what was captured.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()

            execution_time = time.time() - start_time

            logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行")

            return JSONResponse(content={
                "success": True,
                "result_data": result_data,
                "grouping": grouping,
                "output": output,
                "execution_time": execution_time,
                "result_shape": [len(result_data), len(result_df.columns)]
            })

        except Exception as e:
            # Restore stdout before handing the error to the outer handler.
            sys.stdout = sys.__stdout__
            output = captured_output.getvalue()
            raise e

    except Exception as e:
        logger.error(f"多指标转矩阵失败: {str(e)}")
        import traceback
        traceback.print_exc()
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            # start_time may be missing if the failure happened before timing began.
            "execution_time": time.time() - start_time if 'start_time' in locals() else 0
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/operations/fillna-stats")
|
||||
async def operation_fillna_stats(request: FillnaStatsRequest):
|
||||
"""
|
||||
|
||||
@@ -24,3 +24,9 @@ __version__ = '1.0.0'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -157,3 +157,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -117,3 +117,9 @@ def apply_filter(
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
921
extraction_service/operations/metric_time_transform.py
Normal file
921
extraction_service/operations/metric_time_transform.py
Normal file
@@ -0,0 +1,921 @@
|
||||
"""
|
||||
指标-时间表转换(Metric-Time Transform)
|
||||
|
||||
将多个时间点列转换为"指标行+时间点列"格式
|
||||
典型医学场景:
|
||||
- 制作临床研究Table 1
|
||||
- 横向对比同一指标的时间变化
|
||||
- 多时间点随访数据整理
|
||||
|
||||
示例:
|
||||
输入(宽表):
|
||||
Record_ID | FMA___基线 | FMA___2周 | FMA___1月
|
||||
10 | 54 | 93 | 68
|
||||
11 | 16 | 31 | 72
|
||||
|
||||
输出(指标-时间表):
|
||||
Record_ID | 时间点 | 基线 | 2周 | 1月
|
||||
10 | FMA | 54 | 93 | 68
|
||||
11 | FMA | 16 | 31 | 72
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Optional, Dict, Any
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
def detect_common_pattern(column_names: List[str]) -> Dict[str, Any]:
    """
    Auto-detect the shared pattern (prefix, separator, timepoints) in column names.

    Args:
        column_names: list of column names belonging to one metric.

    Returns:
        {
            'success': bool,
            'common_prefix': str,    # shared prefix (the metric name)
            'separator': str,        # detected separator ('' when none found)
            'timepoints': List[str], # per-column timepoint labels
            'confidence': float,     # 0.0-1.0
            'message': str
        }

    Examples:
        >>> cols = ['FMA总得分___筛选及基线', 'FMA总得分___随访(2周)', 'FMA总得分___随访(1个月)']
        >>> result = detect_common_pattern(cols)
        >>> result['common_prefix']
        'FMA总得分'
        >>> result['separator']
        '___'
        >>> result['timepoints']
        ['筛选及基线', '随访(2周)', '随访(1个月)']
    """
    print(f"\n🔍 开始自动检测列名模式...", flush=True)
    print(f"   输入列数: {len(column_names)}", flush=True)

    if len(column_names) < 2:
        return {
            'success': False,
            'common_prefix': '',
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': '至少需要2列才能检测模式'
        }

    # Show the first few column names as a sample for the log.
    print(f"   样本列名:", flush=True)
    for i, col in enumerate(column_names[:3]):
        print(f"     [{i+1}] {col}", flush=True)
    if len(column_names) > 3:
        print(f"     ... 还有 {len(column_names) - 3} 列", flush=True)

    # ==================== 1. Longest common prefix ====================
    common_prefix = os.path.commonprefix(column_names)
    print(f"\n   ✓ 检测到公共前缀: '{common_prefix}'", flush=True)

    if not common_prefix:
        return {
            'success': False,
            'common_prefix': '',
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': '未检测到公共前缀,选中的列可能不属于同一指标'
        }

    # ==================== 2. Separator detection ====================
    # Candidate separators, in priority order (longest first).
    separators = ['___', '__', '_', '-', '.', '|', ' - ', ' ']
    detected_separator = None

    # Method 1: the common prefix itself ends with a separator.
    for sep in separators:
        if common_prefix.endswith(sep):
            detected_separator = sep
            common_prefix = common_prefix[:-len(sep)]  # drop trailing separator
            print(f"   ✓ 检测到分隔符: '{sep}' (位于公共前缀末尾)", flush=True)
            break

    # Method 2: every remainder starts with the same separator.
    if not detected_separator:
        remainders = [col[len(common_prefix):] for col in column_names]
        # BUGFIX: previously `all(... for r in remainders if r)` was vacuously
        # True when every remainder was empty (identical column names), which
        # silently picked the first candidate separator ('___'). Require at
        # least one non-empty remainder before trusting the scan.
        if any(remainders):
            for sep in separators:
                if all(r.startswith(sep) for r in remainders if r):
                    detected_separator = sep
                    print(f"   ✓ 检测到分隔符: '{sep}' (位于剩余部分开头)", flush=True)
                    break

    # ✨ Method 3: smart correction — if the remainders still contain the
    # separator, the shared text before it belongs to the prefix; extend it.
    if detected_separator:
        remainders = [col[len(common_prefix):] for col in column_names]

        parts_before_sep = []
        for remainder in remainders:
            if detected_separator in remainder:
                # Text before the first separator occurrence.
                sep_pos = remainder.find(detected_separator)
                parts_before_sep.append(remainder[:sep_pos])
            else:
                parts_before_sep.append('')

        # Extend the prefix only when every non-empty part agrees.
        if parts_before_sep and all(p == parts_before_sep[0] for p in parts_before_sep if p):
            additional_prefix = parts_before_sep[0]
            if additional_prefix:
                print(f"   🔄 智能修正: 扩展公共前缀 '{common_prefix}' → '{common_prefix}{additional_prefix}'", flush=True)
                common_prefix = common_prefix + additional_prefix

    if not detected_separator:
        print(f"   ⚠️ 未检测到明确分隔符,使用空字符串", flush=True)
        detected_separator = ''

    # ==================== 3. Timepoint extraction ====================
    if detected_separator:
        # Remove the separator *string* exactly once (not lstrip, which would
        # strip any run of the separator's characters).
        timepoints = []
        for col in column_names:
            remainder = col[len(common_prefix):]
            if remainder.startswith(detected_separator):
                timepoint = remainder[len(detected_separator):]
            else:
                timepoint = remainder
            timepoints.append(timepoint.strip())
    else:
        # No separator: the whole remainder is the timepoint.
        timepoints = [col[len(common_prefix):].strip() for col in column_names]

    print(f"   ✓ 提取到 {len(timepoints)} 个时间点:", flush=True)
    for i, tp in enumerate(timepoints[:5]):
        print(f"     [{i+1}] {tp}", flush=True)
    if len(timepoints) > 5:
        print(f"     ... 还有 {len(timepoints) - 5} 个", flush=True)

    # ==================== 4. Confidence score ====================
    confidence = 1.0

    # Timepoints must not be empty.
    empty_count = sum(1 for tp in timepoints if not tp)
    if empty_count > 0:
        confidence -= 0.3
        print(f"   ⚠️ 发现 {empty_count} 个空时间点,降低置信度", flush=True)

    # Timepoints should be distinct.
    unique_timepoints = len(set(timepoints))
    if unique_timepoints < len(timepoints):
        confidence -= 0.2
        print(f"   ⚠️ 时间点有重复,降低置信度", flush=True)

    # A very short prefix is probably coincidental.
    if len(common_prefix) < 2:
        confidence -= 0.2
        print(f"   ⚠️ 公共前缀过短,降低置信度", flush=True)

    confidence = max(0.0, min(1.0, confidence))

    print(f"\n   📊 检测置信度: {confidence:.0%}", flush=True)

    # ==================== 5. Message ====================
    if confidence >= 0.8:
        message = f"成功检测:指标='{common_prefix}', 分隔符='{detected_separator}', {len(timepoints)}个时间点"
    elif confidence >= 0.5:
        message = f"检测成功但有警告,建议检查结果"
    else:
        message = f"检测置信度较低,建议手动指定参数"

    return {
        'success': True,
        'common_prefix': common_prefix,
        'separator': detected_separator,
        'timepoints': timepoints,
        'confidence': confidence,
        'message': message
    }
|
||||
|
||||
|
||||
def apply_metric_time_transform(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    metric_name: Optional[str] = None,
    separator: Optional[str] = None,
    timepoint_col_name: str = '时间点'
) -> pd.DataFrame:
    """
    Apply the metric-time table transform.

    Turns several timepoint columns of one metric into a layout where the
    metric appears as a row label and each timepoint becomes its own column.

    Args:
        df: input DataFrame.
        id_vars: ID columns (copied through unchanged).
        value_vars: value columns (same metric, several timepoints).
        metric_name: metric name; auto-detected when falsy.
        separator: separator; auto-detected when None.
        timepoint_col_name: name of the generated metric-label column.

    Returns:
        Transformed DataFrame with columns:
        id_vars + [timepoint_col_name] + timepoints (row count unchanged).

    Raises:
        ValueError: empty id_vars, fewer than 2 value_vars, or detection failure.
        KeyError: a referenced column does not exist.

    Examples:
        >>> df = pd.DataFrame({
        ...     'Record_ID': [10, 11],
        ...     'FMA___基线': [54, 16],
        ...     'FMA___2周': [93, 31],
        ...     'FMA___1月': [68, 72]
        ... })
        >>> result = apply_metric_time_transform(
        ...     df,
        ...     id_vars=['Record_ID'],
        ...     value_vars=['FMA___基线', 'FMA___2周', 'FMA___1月']
        ... )
        >>> result.columns.tolist()
        ['Record_ID', '时间点', '基线', '2周', '1月']
    """
    print("\n" + "="*60, flush=True)
    print("🔄 开始指标-时间表转换...", flush=True)
    print("="*60, flush=True)

    # ==================== Parameter validation ====================
    if df.empty:
        print("⚠️ 输入数据框为空", flush=True)
        return df

    if not id_vars:
        raise ValueError('❌ 至少需要选择1个ID列')

    if len(value_vars) < 2:
        raise ValueError('❌ 至少需要选择2个值列')

    # Every referenced column must exist.
    for col in id_vars + value_vars:
        if col not in df.columns:
            raise KeyError(f"❌ 列 '{col}' 不存在")

    print(f"\n📊 转换前数据概况:", flush=True)
    print(f"   - 总行数: {len(df)}", flush=True)
    print(f"   - ID列: {len(id_vars)} 个 ({', '.join(id_vars)})", flush=True)
    print(f"   - 值列: {len(value_vars)} 个", flush=True)

    # ==================== Auto-detect or use explicit parameters ====================
    if not metric_name or separator is None:
        print(f"\n🔍 自动检测模式...", flush=True)
        pattern = detect_common_pattern(value_vars)

        if not pattern['success']:
            raise ValueError(f"❌ 自动检测失败: {pattern['message']}")

        metric_name = metric_name or pattern['common_prefix']
        separator = separator if separator is not None else pattern['separator']
        timepoints = pattern['timepoints']

        print(f"\n✅ 使用检测结果:", flush=True)
        print(f"   - 指标名: '{metric_name}'", flush=True)
        print(f"   - 分隔符: '{separator}'", flush=True)
        print(f"   - 置信度: {pattern['confidence']:.0%}", flush=True)
    else:
        print(f"\n✅ 使用手动指定参数:", flush=True)
        print(f"   - 指标名: '{metric_name}'", flush=True)
        print(f"   - 分隔符: '{separator}'", flush=True)

        # Split the timepoints manually from the column names.
        timepoints = []
        for col in value_vars:
            # Drop the metric name (first occurrence only).
            remainder = col.replace(metric_name, '', 1)
            if separator and separator in col:
                # BUGFIX: previously used .lstrip(separator), which strips any
                # run of the separator's *characters* instead of removing the
                # separator string once — e.g. 'S____a' with separator '___'
                # yielded timepoint 'a' instead of '_a'. Remove exactly one
                # leading separator (same fix already applied in
                # detect_common_pattern's timepoint extraction).
                if remainder.startswith(separator):
                    remainder = remainder[len(separator):]
                timepoints.append(remainder)
            else:
                timepoints.append(remainder.strip())

    # ==================== Build the result DataFrame ====================
    print(f"\n🔨 开始构建结果数据...", flush=True)

    result_rows = []

    for idx, row in df.iterrows():
        result_row = {}

        # 1. Copy the ID columns.
        for id_col in id_vars:
            result_row[id_col] = row[id_col]

        # 2. Add the timepoint column (it actually holds the metric name).
        result_row[timepoint_col_name] = metric_name

        # 3. One output column per timepoint.
        for original_col, timepoint in zip(value_vars, timepoints):
            result_row[timepoint] = row[original_col]

        result_rows.append(result_row)

    result_df = pd.DataFrame(result_rows)

    # ==================== Column order ====================
    # Order: ID columns + metric-label column + one column per timepoint.
    column_order = id_vars + [timepoint_col_name] + timepoints
    result_df = result_df[column_order]

    # ==================== Summary output ====================
    print(f"\n{'='*60}", flush=True)
    print(f"✅ 指标-时间表转换完成!", flush=True)
    print(f"{'='*60}", flush=True)
    print(f"📊 转换结果:", flush=True)
    print(f"   - 总行数: {len(result_df)} (不变)", flush=True)
    print(f"   - 总列数: {len(result_df.columns)} (ID列 + 时间点列 + {len(timepoints)}个时间点列)", flush=True)
    print(f"   - 指标名: {metric_name}", flush=True)
    print(f"   - 时间点: {', '.join(timepoints[:5])}{'...' if len(timepoints) > 5 else ''}", flush=True)

    # Show the first rows as a sanity preview.
    print(f"\n   前3行数据示例:", flush=True)
    for idx, row in result_df.head(3).iterrows():
        row_preview = ' | '.join([f"{col}={row[col]}" for col in result_df.columns[:4]])
        print(f"     [{idx}] {row_preview}...", flush=True)

    return result_df
|
||||
|
||||
|
||||
def preview_metric_time_transform(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    preview_rows: int = 5
) -> Dict[str, Any]:
    """
    Preview the metric-time transform without running it on the full frame.

    Args:
        df: input DataFrame.
        id_vars: ID columns.
        value_vars: value columns to analyze and transform.
        preview_rows: how many rows of df to transform for the preview.

    Returns:
        On success:
        {
            'pattern': {
                'common_prefix': str,
                'separator': str,
                'timepoints': List[str],
                'confidence': float
            },
            'original_shape': (rows, cols),
            'new_shape': (rows, cols),
            'preview_data': List[Dict],
            'estimated_change': str
        }
        On failure: {'success': False, 'error': str}.
    """
    # The transform cannot run without a detected naming pattern.
    pattern = detect_common_pattern(value_vars)
    if not pattern['success']:
        return {
            'success': False,
            'error': pattern['message']
        }

    # Only transform the head of the frame — cheap preview.
    head = df.head(preview_rows)

    try:
        transformed = apply_metric_time_transform(
            head,
            id_vars,
            value_vars,
            pattern['common_prefix'],
            pattern['separator']
        )

        n_timepoints = len(pattern['timepoints'])
        new_cols = len(id_vars) + 1 + n_timepoints

        return {
            'success': True,
            'pattern': pattern,
            'original_shape': (len(df), len(df.columns)),
            'new_shape': (len(df), new_cols),
            'preview_data': transformed.to_dict('records'),
            'estimated_change': f"列数: {len(df.columns)} → {new_cols} (ID列 + 时间点列 + {n_timepoints}个时间点列)"
        }
    except Exception as e:
        return {
            'success': False,
            'error': str(e)
        }
|
||||
|
||||
|
||||
# ==================== 多指标转换(方向1:时间点为行,指标为列)====================
|
||||
|
||||
def detect_metric_groups(
    column_names: List[str],
    separators: Optional[List[str]] = None
) -> Dict[str, Any]:
    """
    Automatically detect and group columns that belong to several metrics.

    Args:
        column_names: column names, e.g.
            ['FMA总得分_基线', 'FMA总得分_随访1', 'ADL总分_基线', 'ADL总分_随访1']
        separators: candidate separators; defaults to
            ['___', '__', '_', '-', '.', '|', ' - ', ' ']

    Returns:
        {
            'success': bool,
            'metric_groups': {metric_name: [column, ...], ...},
            'separator': str,           # separator that matched every column
            'timepoints': [...],        # timepoint labels (shared, or union)
            'confidence': float,        # 0.0-1.0
            'message': str
        }
    """
    print(f"\n🔍 开始自动检测多指标分组...", flush=True)
    print(f"   输入列数: {len(column_names)}", flush=True)

    def _failure(reason: str) -> Dict[str, Any]:
        # Uniform shape for every unsuccessful outcome.
        return {
            'success': False,
            'metric_groups': {},
            'separator': '',
            'timepoints': [],
            'confidence': 0.0,
            'message': reason
        }

    if len(column_names) < 2:
        return _failure('至少需要2列才能检测分组')

    candidate_seps = separators if separators is not None else ['___', '__', '_', '-', '.', '|', ' - ', ' ']

    # ---- 1. First candidate separator present in every column wins ----
    detected_separator = None
    metric_groups: Dict[str, List[str]] = {}

    for sep in candidate_seps:
        groups: Dict[str, List[str]] = {}
        usable = True

        for col in column_names:
            if sep not in col:
                usable = False
                break
            pieces = col.split(sep)
            if len(pieces) < 2:
                usable = False
                break
            # The first piece names the metric.
            groups.setdefault(pieces[0], []).append(col)

        if usable and groups:
            detected_separator = sep
            metric_groups = groups
            print(f"   ✓ 检测到分隔符: '{sep}'", flush=True)
            break

    if not detected_separator:
        return _failure('未检测到公共分隔符,请确认选中的列格式一致')

    # ---- 2. Per-metric timepoints: the text after the LAST separator ----
    # (supports multi-level names such as 'FMA总得分_子项_基线')
    metric_timepoints = {
        metric: [c.split(detected_separator)[-1].strip() for c in cols]
        for metric, cols in metric_groups.items()
    }

    print(f"   ✓ 检测到 {len(metric_groups)} 个指标:", flush=True)
    for metric, cols in metric_groups.items():
        print(f"     • {metric} ({len(cols)}列)", flush=True)

    # ---- 3. Do all metrics share the same timepoint list? ----
    timepoint_lists = list(metric_timepoints.values())
    reference = timepoint_lists[0]
    consistent = all(tp_list == reference for tp_list in timepoint_lists[1:])

    if consistent:
        all_unique_timepoints = reference
        confidence = 1.0
        message = f"成功检测到{len(metric_groups)}个指标,共{len(all_unique_timepoints)}个时间点"
    else:
        print(f"   ⚠️ 警告: 各指标的时间点不完全一致", flush=True)
        # Fall back to the union of all timepoints; gaps become NA downstream.
        all_unique_timepoints = sorted(set(tp for tps in timepoint_lists for tp in tps))
        confidence = 0.6
        message = f"检测到{len(metric_groups)}个指标,但时间点不完全一致。将使用所有时间点的并集,缺失值将填充为NA。"

    print(f"   ✓ 检测到 {len(all_unique_timepoints)} 个时间点:", flush=True)
    for i, tp in enumerate(all_unique_timepoints[:5]):
        print(f"     [{i+1}] {tp}", flush=True)
    if len(all_unique_timepoints) > 5:
        print(f"     ... 还有 {len(all_unique_timepoints) - 5} 个", flush=True)

    # ---- 4. Confidence adjustment: metrics with differing column counts ----
    if len({len(cols) for cols in metric_groups.values()}) > 1:
        confidence -= 0.2
        print(f"   ⚠️ 各指标的列数不同,降低置信度", flush=True)

    return {
        'success': True,
        'metric_groups': dict(metric_groups),
        'separator': detected_separator,
        'timepoints': all_unique_timepoints,
        'confidence': confidence,
        'message': message
    }
|
||||
|
||||
|
||||
def apply_multi_metric_to_long(
    df: pd.DataFrame,
    id_vars: List[str],
    metric_groups: Dict[str, List[str]],
    separator: str,
    event_col_name: str = 'Event_Name'
) -> pd.DataFrame:
    """
    Multi-metric wide-to-long transform: timepoints become rows, metrics become columns.

    Args:
        df: source DataFrame.
        id_vars: ID columns carried through unchanged.
        metric_groups: mapping of metric name -> its timepoint columns,
            e.g. {'FMA总得分': ['FMA总得分_基线', ...], ...}.
        separator: separator between metric name and timepoint.
        event_col_name: name of the generated timepoint column.

    Returns:
        Long-format DataFrame with columns id_vars + [event_col_name] + one
        column per metric. Rows keep the original record order; within a
        record, timepoints sort by their label.

    Example:
        input : Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1
        output: Record_ID | Event_Name | FMA总得分 | ADL总分   (one row per timepoint)
    """
    print(f"\n🔄 开始多指标转长表转换...", flush=True)
    print(f"   原始形状: {df.shape}", flush=True)
    print(f"   ID列: {id_vars}", flush=True)
    print(f"   指标数: {len(metric_groups)}", flush=True)

    # ✨ Tag every source row with its position so the original record order
    # can be restored after melting and merging.
    source = df.copy()
    source['_original_order'] = range(len(source))
    melt_ids = id_vars + ['_original_order']

    def _event_label(col_label):
        # The timepoint is whatever follows the last separator occurrence.
        if separator in col_label:
            return col_label.split(separator)[-1].strip()
        return col_label

    # ---- 1. One melted frame per metric: (ids, order, event, value) ----
    long_frames = []
    for metric, metric_cols in metric_groups.items():
        print(f"   • 处理指标: {metric} ({len(metric_cols)}列)", flush=True)

        subset = source[melt_ids + metric_cols].copy()
        melted = subset.melt(
            id_vars=melt_ids,
            value_vars=metric_cols,
            var_name='_temp_col',
            value_name=metric
        )
        melted[event_col_name] = melted['_temp_col'].apply(_event_label)
        long_frames.append(melted.drop('_temp_col', axis=1))

    # ---- 2. Outer-merge the per-metric frames so no timepoint is dropped ----
    print(f"   • 合并 {len(long_frames)} 个指标的数据...", flush=True)
    join_keys = id_vars + ['_original_order', event_col_name]
    merged = long_frames[0]
    for frame in long_frames[1:]:
        merged = merged.merge(frame, on=join_keys, how='outer')

    # ---- 3. Restore original record order, timepoints sorted within it ----
    merged = merged.sort_values(by=['_original_order', event_col_name]).reset_index(drop=True)
    merged = merged.drop('_original_order', axis=1)

    # ---- 4. Final column order: IDs, then event, then every metric ----
    value_columns = [c for c in merged.columns if c not in id_vars and c != event_col_name]
    merged = merged[id_vars + [event_col_name] + value_columns]

    print(f"   ✓ 转换完成!新形状: {merged.shape}", flush=True)
    print(f"   ✓ 列顺序: {list(merged.columns)}", flush=True)

    return merged
|
||||
|
||||
|
||||
def preview_multi_metric_to_long(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    separators: Optional[List[str]] = None,
    event_col_name: str = 'Event_Name',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """
    Preview the multi-metric long-format transform on the first few rows.

    Returns:
        On success:
        {
            'success': True,
            'grouping': {...},          # detect_metric_groups result
            'original_shape': (rows, cols),
            'new_shape': (rows, cols),  # estimate for the full frame
            'preview_data': [...],
            'estimated_change': str
        }
        On failure: {'success': False, 'error': str}.
    """
    print(f"\n📊 预览多指标转长表...", flush=True)

    # Step 1: detect how the value columns group into metrics.
    grouping = detect_metric_groups(value_vars, separators)
    if not grouping['success']:
        return {
            'success': False,
            'error': grouping['message']
        }

    # Step 2: run the real transform, but only on a small head slice.
    head = df.head(preview_rows)

    try:
        transformed = apply_multi_metric_to_long(
            head,
            id_vars,
            grouping['metric_groups'],
            grouping['separator'],
            event_col_name
        )

        n_metrics = len(grouping['metric_groups'])
        n_events = len(grouping['timepoints'])
        new_rows = len(df) * n_events
        new_cols = len(id_vars) + 1 + n_metrics

        return {
            'success': True,
            'grouping': grouping,
            'original_shape': (len(df), len(df.columns)),
            'new_shape': (new_rows, new_cols),
            'preview_data': transformed.to_dict('records'),
            'estimated_change': f"行数: {len(df)} → {new_rows} (每个ID复制{n_events}次); 列数: {len(df.columns)} → {new_cols} (ID列 + 时间点列 + {n_metrics}个指标列)"
        }
    except Exception as e:
        import traceback
        print(f"   ❌ 预览失败: {str(e)}", flush=True)
        traceback.print_exc()
        return {
            'success': False,
            'error': str(e)
        }
|
||||
|
||||
|
||||
# ==================== 多指标转换(方向2:时间点为列,指标为行)====================
|
||||
|
||||
def apply_multi_metric_to_matrix(
    df: pd.DataFrame,
    id_vars: List[str],
    metric_groups: Dict[str, List[str]],
    separator: str,
    event_col_name: str = 'Event_Name',
    metric_col_name: str = '指标名'
) -> pd.DataFrame:
    """
    Multi-metric matrix transform: one row per (id, metric), one column
    per timepoint.

    Args:
        df: source wide-format DataFrame.
        id_vars: identifier columns.
        metric_groups: mapping metric name -> list of its source columns.
        separator: separator between metric name and timepoint in column names.
        event_col_name: name of the intermediate timepoint column.
        metric_col_name: name of the output metric-name column.

    Returns:
        DataFrame with columns: id columns, metric-name column, then one
        column per timepoint (ordered as in the first metric's source
        columns). The original row order of ``df`` is preserved.

    Example:
        input:
            Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1
            10        | 58            | 67             | 值1          | 值2
        output:
            Record_ID | 指标名    | 基线 | 随访1
            10        | FMA总得分 | 58   | 67
            10        | ADL总分   | 值1  | 值2
    """
    print(f"\n🔄 开始多指标转矩阵格式...", flush=True)
    print(f" 原始形状: {df.shape}", flush=True)
    print(f" ID列: {id_vars}", flush=True)
    print(f" 指标数: {len(metric_groups)}", flush=True)

    # Remember each row's original position so the output can keep the
    # source Record-ID order after pivoting.
    df_with_order = df.copy()
    df_with_order['_original_order'] = range(len(df_with_order))

    # Map id value(s) -> original position (tuple key when several id columns).
    if len(id_vars) == 1:
        id_to_order = df_with_order.set_index(id_vars[0])['_original_order'].to_dict()
    else:
        id_to_order = df_with_order.set_index(id_vars)['_original_order'].to_dict()

    # ==================== 1. Reshape to the long format first ====================
    df_long = apply_multi_metric_to_long(
        df,
        id_vars,
        metric_groups,
        separator,
        event_col_name
    )

    print(f" • 长表形状: {df_long.shape}", flush=True)

    # ==================== 2. Long -> matrix (metrics as rows, timepoints as columns) ====================
    # First melt every metric column into (ID, Event_Name, metric, value) rows.
    metric_cols = [col for col in df_long.columns if col not in id_vars and col != event_col_name]

    print(f" • 准备pivot: {len(metric_cols)} 个指标列", flush=True)

    df_melted = df_long.melt(
        id_vars=id_vars + [event_col_name],
        value_vars=metric_cols,
        var_name=metric_col_name,
        value_name='_value'
    )

    print(f" • Melt后形状: {df_melted.shape}", flush=True)

    # pivot_table (not pivot) because duplicate index entries are possible.
    result = df_melted.pivot_table(
        index=id_vars + [metric_col_name],
        columns=event_col_name,
        values='_value',
        aggfunc='first'  # keep the first value on duplicates
    ).reset_index()

    # Drop the residual columns-level name left by pivot_table.
    result.columns.name = None

    # Attach the original-order key used for sorting below.
    if len(id_vars) == 1:
        result['_original_order'] = result[id_vars[0]].map(id_to_order)
    else:
        # Several id columns: build a tuple key matching id_to_order's keys.
        result['_original_order'] = result[id_vars].apply(tuple, axis=1).map(id_to_order)

    # ==================== 3. Collect the timepoint columns ====================
    # BUG FIX: explicitly exclude the temporary '_original_order' column.
    # It previously slipped into timepoint_cols (being neither an id column
    # nor the metric column), was appended by the defensive loop below, and
    # then raised a KeyError at the final column reorder because the column
    # is dropped before that reindex.
    timepoint_cols = [
        col for col in result.columns
        if col not in id_vars and col not in (metric_col_name, '_original_order')
    ]

    # Recover the timepoint order from the first metric's source columns.
    first_metric_cols = list(metric_groups.values())[0]
    original_timepoint_order = []
    for col in first_metric_cols:
        timepoint = col.split(separator)[-1].strip() if separator in col else col
        if timepoint not in original_timepoint_order:
            original_timepoint_order.append(timepoint)

    # Arrange the timepoint columns in that recovered order.
    sorted_timepoint_cols = []
    for tp in original_timepoint_order:
        if tp in timepoint_cols:
            sorted_timepoint_cols.append(tp)
    # Defensive: keep any timepoint that was not in the derived order.
    for tp in timepoint_cols:
        if tp not in sorted_timepoint_cols:
            sorted_timepoint_cols.append(tp)

    # ==================== 4. Sort ====================
    # Restore the original row order, then metric name within each row.
    result = result.sort_values(by=['_original_order', metric_col_name]).reset_index(drop=True)

    # Remove the temporary ordering column.
    result = result.drop('_original_order', axis=1)

    # ==================== 5. Final column layout ====================
    # id columns -> metric-name column -> timepoint columns.
    desired_column_order = id_vars + [metric_col_name] + sorted_timepoint_cols
    result = result[desired_column_order]

    print(f" ✓ 转换完成!新形状: {result.shape}", flush=True)
    print(f" ✓ 列顺序: {list(result.columns)}", flush=True)

    return result
|
||||
|
||||
|
||||
def preview_multi_metric_to_matrix(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    separators: Optional[List[str]] = None,
    metric_col_name: str = '指标名',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """
    Preview the multi-metric matrix transformation.

    Detects the metric grouping, runs the real transformation on the first
    ``preview_rows`` rows only, and projects the full output shape.

    Returns a dict of the form::

        {
            'success': bool,
            'grouping': {...},        # result of detect_metric_groups
            'original_shape': (rows, cols),
            'new_shape': (rows, cols),
            'preview_data': [...],
            'estimated_change': str
        }
    """
    print(f"\n📊 预览多指标转矩阵格式...", flush=True)

    # Detect metric / timepoint grouping from the selected value columns.
    detection = detect_metric_groups(value_vars, separators)
    if not detection['success']:
        return {'success': False, 'error': detection['message']}

    # Transform only a head slice to keep the preview cheap.
    head_df = df.head(preview_rows)
    try:
        sample = apply_multi_metric_to_matrix(
            head_df,
            id_vars,
            detection['metric_groups'],
            detection['separator'],
            'Event_Name',
            metric_col_name,
        )

        metric_count = len(detection['metric_groups'])
        timepoint_count = len(detection['timepoints'])

        # Projected rows: one per (source row, metric).
        projected_rows = len(df) * metric_count
        # Projected columns: id columns + metric-name column + timepoints.
        projected_cols = len(id_vars) + 1 + timepoint_count

        return {
            'success': True,
            'grouping': detection,
            'original_shape': (len(df), len(df.columns)),
            'new_shape': (projected_rows, projected_cols),
            'preview_data': sample.to_dict('records'),
            'estimated_change': f"行数: {len(df)} → {projected_rows} (每个ID复制{metric_count}次,每个指标1行); 列数: {len(df.columns)} → {projected_cols} (ID列 + 指标名列 + {timepoint_count}个时间点列)",
        }
    except Exception as e:
        import traceback
        print(f" ❌ 预览失败: {str(e)}", flush=True)
        traceback.print_exc()
        return {'success': False, 'error': str(e)}
|
||||
|
||||
289
extraction_service/operations/unpivot.py
Normal file
289
extraction_service/operations/unpivot.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""
|
||||
宽表转长表(Unpivot/Melt)操作
|
||||
|
||||
提供数据重塑功能,将宽格式转换为长格式。
|
||||
典型医学场景:
|
||||
- 多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列)
|
||||
- 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列)
|
||||
- 治疗组对比(治疗组_NRS、对照组_NRS → 组别列 + NRS列)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Optional, Dict, Any
|
||||
import sys
|
||||
|
||||
|
||||
def apply_unpivot(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    var_name: str = '变量',
    value_name: str = '值',
    parse_column_names: bool = False,
    separator: str = '_',
    metric_name: Optional[str] = None,
    time_name: Optional[str] = None,
    dropna: bool = False
) -> pd.DataFrame:
    """
    Wide-to-long (unpivot/melt) transformation.

    Args:
        df: input DataFrame. Returned unchanged when empty.
        id_vars: identifier columns kept as-is on every output row (>= 1).
        value_vars: value columns to be unpivoted (>= 2).
        var_name: output column holding the original column name.
        value_name: output column holding the value.
        parse_column_names: if True, split each original column name into a
            metric part and a time part (e.g. "FMA_基线" -> "FMA" + "基线").
        separator: separator used when parsing column names; only the first
            occurrence splits — the remainder becomes the time part.
        metric_name: output column name for the metric part (default '指标').
        time_name: output column name for the time part (default '时间点').
        dropna: if True, drop rows whose value is missing.

    Returns:
        Long-format DataFrame sorted by ``id_vars``. When
        ``parse_column_names`` succeeds, columns are ordered as:
        id columns, metric column, time column, value column.

    Raises:
        ValueError: no id column, fewer than 2 value columns, or overlap
            between ``id_vars`` and ``value_vars``.
        KeyError: a requested column does not exist in ``df``.

    Examples:
        >>> df = pd.DataFrame({
        ...     '患者ID': ['P001', 'P002'],
        ...     '性别': ['男', '女'],
        ...     'FMA_基线': [32, 28],
        ...     'FMA_2周': [45, 38],
        ...     'FMA_1月': [52, 44]
        ... })
        >>> result = apply_unpivot(df, id_vars=['患者ID', '性别'],
        ...                        value_vars=['FMA_基线', 'FMA_2周', 'FMA_1月'],
        ...                        var_name='时间点', value_name='FMA值')
        >>> len(result)   # 2 patients x 3 timepoints
        6
        >>> result.columns.tolist()
        ['患者ID', '性别', '时间点', 'FMA值']

        >>> result = apply_unpivot(df, id_vars=['患者ID', '性别'],
        ...                        value_vars=['FMA_基线', 'FMA_2周', 'FMA_1月'],
        ...                        parse_column_names=True, separator='_',
        ...                        metric_name='指标', time_name='时间点',
        ...                        value_name='测量值')
        >>> result.columns.tolist()
        ['患者ID', '性别', '指标', '时间点', '测量值']
    """
    print("\n" + "="*60, flush=True)
    print("🔄 开始宽表转长表转换...", flush=True)
    print("="*60, flush=True)

    # ==================== Parameter validation ====================

    if df.empty:
        print("⚠️ 输入数据框为空", flush=True)
        return df

    if not id_vars:
        raise ValueError('❌ 至少需要选择1个ID列(标识列)')

    if len(value_vars) < 2:
        raise ValueError('❌ 至少需要选择2个值列(需要转换的列)')

    # Verify every requested column exists.
    missing_id_cols = [col for col in id_vars if col not in df.columns]
    if missing_id_cols:
        raise KeyError(f"❌ ID列不存在: {', '.join(missing_id_cols)}")

    missing_value_cols = [col for col in value_vars if col not in df.columns]
    if missing_value_cols:
        raise KeyError(f"❌ 值列不存在: {', '.join(missing_value_cols)}")

    # id columns and value columns must be disjoint.
    overlap = set(id_vars) & set(value_vars)
    if overlap:
        raise ValueError(f"❌ ID列和值列不能重复: {', '.join(overlap)}")

    print(f"\n📊 转换前数据概况:", flush=True)
    print(f" - 总行数: {len(df)}", flush=True)
    print(f" - 总列数: {len(df.columns)}", flush=True)
    print(f" - ID列: {len(id_vars)} 个 ({', '.join(id_vars[:3])}{'...' if len(id_vars) > 3 else ''})", flush=True)
    print(f" - 值列: {len(value_vars)} 个 ({', '.join(value_vars[:3])}{'...' if len(value_vars) > 3 else ''})", flush=True)

    # ==================== Base transformation (pandas.melt) ====================

    try:
        result = pd.melt(
            df,
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=var_name,
            value_name=value_name
        )

        print(f"\n✅ 基础转换完成:", flush=True)
        print(f" - 转换后行数: {len(result)} (原 {len(df)} × {len(value_vars)})", flush=True)
        print(f" - 转换后列数: {len(result.columns)} (ID列 + 变量名列 + 值列)", flush=True)

    except Exception as e:
        print(f"❌ 转换失败: {str(e)}", flush=True)
        raise

    # ==================== Optional: parse column names ====================

    if parse_column_names and separator:
        print(f"\n🔍 开始解析列名(分隔符: '{separator}')...", flush=True)

        def parse_column_name(name: str):
            """Split 'metric<sep>rest' into (metric, rest); no separator -> (name, '')."""
            parts = name.split(separator)
            if len(parts) >= 2:
                metric = parts[0]
                time = separator.join(parts[1:])
                return metric, time
            else:
                # No separator: whole name is the metric, time part stays empty.
                return name, ''

        metric_col = metric_name or '指标'
        time_col = time_name or '时间点'

        try:
            # BUG FIX: compute everything BEFORE mutating `result`, so a
            # parsing failure cannot leave the frame half-transformed.
            # Previously the original column could already be dropped when
            # the except branch claimed it had been kept.
            parsed = result[var_name].apply(parse_column_name)
            metric_series = parsed.str[0]
            time_series = parsed.str[1]

            # Commit all mutations atomically now that parsing succeeded.
            result[metric_col] = metric_series
            result[time_col] = time_series
            result = result.drop(columns=[var_name])

            # BUG FIX: reorder columns to match the documented contract
            # (id columns, metric, time, value). Plain assignment appends
            # the parsed columns after the value column otherwise.
            result = result[id_vars + [metric_col, time_col, value_name]]

            unique_metrics = result[metric_col].nunique()
            unique_times = result[time_col].nunique()

            print(f"✅ 列名解析完成:", flush=True)
            print(f" - {metric_col}列: {unique_metrics} 个唯一值", flush=True)
            print(f" - {time_col}列: {unique_times} 个唯一值", flush=True)

            # Show up to 3 parse examples.
            sample_original = value_vars[:3]
            print(f"\n 解析示例:", flush=True)
            for orig in sample_original:
                metric, time = parse_column_name(orig)
                print(f" - '{orig}' → {metric_col}='{metric}', {time_col}='{time}'", flush=True)

        except Exception as e:
            # Nothing was mutated above on failure, so this message is accurate.
            print(f"⚠️ 列名解析失败: {str(e)}", flush=True)
            print(f" 已保留原变量名列: {var_name}", flush=True)

    # ==================== Optionally drop missing-value rows ====================

    if dropna:
        original_len = len(result)
        result = result.dropna(subset=[value_name])
        dropped = original_len - len(result)

        if dropped > 0:
            print(f"\n🗑️ 删除缺失值行: {dropped} 行 ({dropped/original_len*100:.1f}%)", flush=True)

    # ==================== Sort ====================

    # Sort by the id columns so each subject's rows stay grouped.
    result = result.sort_values(id_vars).reset_index(drop=True)

    print(f"\n✅ 排序完成: 按 {', '.join(id_vars[:2])}{'...' if len(id_vars) > 2 else ''} 排序", flush=True)

    # ==================== Final statistics ====================

    print(f"\n{'='*60}", flush=True)
    print(f"✅ 宽表转长表转换完成!", flush=True)
    print(f"{'='*60}", flush=True)
    print(f"📊 最终数据:", flush=True)
    print(f" - 总行数: {len(result)} (扩展了 {len(result)/len(df):.1f}x)", flush=True)
    print(f" - 总列数: {len(result.columns)}", flush=True)
    print(f" - 列名: {', '.join(result.columns.tolist())}", flush=True)

    # Show the first 3 rows as a sanity sample.
    print(f"\n 前3行数据示例:", flush=True)
    for idx, row in result.head(3).iterrows():
        row_str = ' | '.join([f"{col}={row[col]}" for col in result.columns[:4]])
        print(f" [{idx}] {row_str}...", flush=True)

    return result
|
||||
|
||||
|
||||
def get_unpivot_preview(
    df: pd.DataFrame,
    id_vars: List[str],
    value_vars: List[str],
    var_name: str = '变量',
    value_name: str = '值',
    preview_rows: int = 10
) -> Dict[str, Any]:
    """
    Build preview information for an unpivot without running the full melt.

    Projects the post-transformation shape from the input dimensions and
    melts only the first few rows to produce sample records.

    Args:
        df: input DataFrame.
        id_vars: identifier columns.
        value_vars: value columns to be unpivoted.
        var_name: variable-name output column.
        value_name: value output column.
        preview_rows: maximum number of sample records returned.

    Returns:
        {
            'original_shape': (rows, cols),
            'new_shape': (rows, cols),
            'expansion_factor': row-expansion multiplier,
            'preview_data': first N sample records,
            'estimated_change': '将从 100 行 × 15 列 转换为 500 行 × 5 列'
        }
    """
    rows_before, cols_before = df.shape

    # Projection: each source row yields one row per value column; output
    # columns are the id columns plus the variable-name and value columns.
    rows_after = rows_before * len(value_vars)
    cols_after = len(id_vars) + 2

    # Melt only the first rows to build a cheap sample.
    sample = pd.melt(
        df.head(3),
        id_vars=id_vars,
        value_vars=value_vars,
        var_name=var_name,
        value_name=value_name,
    )

    return {
        'original_shape': (rows_before, cols_before),
        'new_shape': (rows_after, cols_after),
        'expansion_factor': len(value_vars),
        'preview_data': sample.head(preview_rows).to_dict('records'),
        'estimated_change': f"将从 {rows_before} 行 × {cols_before} 列 转换为 {rows_after} 行 × {cols_after} 列",
    }
|
||||
|
||||
|
||||
|
||||
@@ -291,3 +291,9 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -57,3 +57,9 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -37,3 +37,9 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -519,6 +519,12 @@ export default FulltextDetailDrawer;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -118,6 +118,12 @@ export function useFulltextResults({
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -81,6 +81,12 @@ export function useFulltextTask({
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -472,6 +472,12 @@ export default FulltextResults;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -116,6 +116,12 @@ export const useAssets = (activeTab: AssetTabType) => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -106,6 +106,12 @@ export const useRecentTasks = () => {
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -345,3 +345,9 @@ export default BinningDialog;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user