From 9b81aef9a7776cab2dcf938d8129fe5d6be90c7d Mon Sep 17 00:00:00 2001 From: HaHafeng Date: Sun, 21 Dec 2025 15:06:15 +0800 Subject: [PATCH] feat(dc): Add multi-metric transformation feature (direction 1+2) Summary: - Implement intelligent multi-metric grouping detection algorithm - Add direction 1: timepoint-as-row, metric-as-column (analysis format) - Add direction 2: timepoint-as-column, metric-as-row (display format) - Fix column name pattern detection (FMA___ issue) - Maintain original Record ID order in output - Add full-select/clear buttons in UI - Integrate into TransformDialog with Radio selection - Update 3 documentation files Technical Details: - Python: detect_metric_groups(), apply_multi_metric_to_long(), apply_multi_metric_to_matrix() - Backend: 3 new methods in QuickActionService - Frontend: MultiMetricPanel.tsx (531 lines) - Total: ~1460 lines of new code Status: Fully tested and verified, ready for production --- DC模块代码恢复指南.md | 6 + .../add_data_stats_to_tool_c_session.sql | 6 + .../001_add_postgres_cache_and_checkpoint.sql | 6 + .../manual-migrations/run-migration-002.ts | 6 + .../20251208_add_column_mapping/migration.sql | 6 + .../migrations/create_tool_c_session.sql | 6 + backend/recover-code-from-cursor-db.js | 6 + backend/scripts/check-dc-tables.mjs | 6 + .../create-tool-c-ai-history-table.mjs | 6 + backend/scripts/create-tool-c-table.js | 6 + backend/scripts/create-tool-c-table.mjs | 6 + backend/src/common/jobs/utils.ts | 6 + .../__tests__/api-integration-test.ts | 6 + .../__tests__/e2e-real-test-v2.ts | 6 + .../__tests__/fulltext-screening-api.http | 6 + .../services/ExcelExporter.ts | 6 + .../services/ConflictDetectionService.ts | 6 + .../dc/tool-b/services/TemplateService.ts | 6 + .../dc/tool-b/workers/extractionWorker.ts | 6 + backend/src/modules/dc/tool-c/README.md | 6 + .../controllers/QuickActionController.ts | 168 +++- .../tool-c/controllers/StreamAIController.ts | 6 + backend/src/modules/dc/tool-c/routes/index.ts | 14 + 
.../dc/tool-c/services/QuickActionService.ts | 265 +++++ backend/src/tests/README.md | 6 + backend/src/tests/verify-test1-database.sql | 6 + backend/src/tests/verify-test1-database.ts | 6 + backend/src/types/global.d.ts | 6 + backend/sync-dc-database.ps1 | 6 + backend/test-tool-c-advanced-scenarios.mjs | 6 + backend/test-tool-c-day2.mjs | 6 + backend/test-tool-c-day3.mjs | 6 + deploy-to-sae.ps1 | 6 + .../00-系统当前状态与开发指南.md | 8 +- .../04-开发计划/05-全文复筛前端开发计划.md | 6 + .../05-开发记录/2025-01-23_全文复筛前端开发完成.md | 6 + .../05-开发记录/2025-01-23_全文复筛前端逻辑调整.md | 6 + .../05-开发记录/2025-11-23_Day5_全文复筛API开发.md | 6 + .../DC-数据清洗整理/00-工具C当前状态与开发指南.md | 145 ++- .../DC-数据清洗整理/00-模块当前状态与开发指南.md | 17 +- .../04-开发计划/工具C_AI_Few-shot示例库.md | 6 + .../04-开发计划/工具C_Bug修复总结_2025-12-08.md | 6 + .../04-开发计划/工具C_Day3开发计划.md | 6 + .../04-开发计划/工具C_Day4-5前端开发计划.md | 6 + .../04-开发计划/工具C_Pivot列顺序优化总结.md | 6 + .../04-开发计划/工具C_方案B实施总结_2025-12-09.md | 6 + .../04-开发计划/工具C_缺失值处理_开发进度_2025-12-10.md | 6 + .../04-开发计划/工具C_缺失值处理功能_更新说明.md | 6 + .../06-开发记录/2025-12-02_工作总结.md | 6 + .../06-开发记录/2025-12-06_工具C_Day1开发完成总结.md | 6 + .../06-开发记录/2025-12-06_工具C_Day2开发完成总结.md | 6 + .../06-开发记录/2025-12-07_AI对话核心功能增强总结.md | 6 + .../2025-12-07_Bug修复_DataGrid空数据防御.md | 6 + .../06-开发记录/2025-12-07_Day5_Ant-Design-X重构完成.md | 6 + .../06-开发记录/2025-12-07_Day5最终总结.md | 6 + .../06-开发记录/2025-12-07_UI优化与Bug修复.md | 6 + .../06-开发记录/2025-12-07_后端API完整对接完成.md | 6 + .../06-开发记录/2025-12-07_完整UI优化与功能增强.md | 6 + .../06-开发记录/2025-12-07_工具C_Day4前端基础完成.md | 6 + .../06-开发记录/DC模块重建完成总结-Day1.md | 6 + .../06-开发记录/Phase1-Portal页面开发完成-2025-12-02.md | 6 + .../Phase2-ToolB-Step1-2开发完成-2025-12-03.md | 6 + .../06-开发记录/Portal页面UI优化-2025-12-02.md | 6 + .../06-开发记录/Tool-B-MVP完成总结-2025-12-03.md | 6 + .../06-开发记录/ToolB-UI优化-2025-12-03.md | 6 + .../06-开发记录/ToolB-UI优化-Round2-2025-12-03.md | 6 + .../06-开发记录/ToolB浏览器测试计划-2025-12-03.md | 6 + .../06-开发记录/后端API测试报告-2025-12-02.md | 6 + .../06-开发记录/待办事项-下一步工作.md | 6 + .../06-开发记录/数据库验证报告-2025-12-02.md | 6 + 
.../07-技术债务/Tool-B技术债务清单.md | 6 + docs/05-部署文档/01-部署架构设计.md | 52 - .../02-SAE部署完全指南(产品经理版).md | 6 + docs/05-部署文档/03-Dify-ECS部署完全指南.md | 4 +- .../04-Python微服务-SAE容器部署指南.md | 30 +- .../05-Node.js后端-SAE容器部署指南.md | 56 +- .../06-前端Nginx-SAE容器部署指南.md | 50 +- docs/05-部署文档/文档修正报告-20251214.md | 6 + docs/07-运维文档/03-SAE环境变量配置指南.md | 6 + .../05-Redis缓存与队列的区别说明.md | 6 + docs/07-运维文档/06-长时间任务可靠性分析.md | 6 + .../07-Redis使用需求分析(按模块).md | 6 + .../2025-12-13-Postgres-Only架构改造完成.md | 6 + .../05-技术债务/通用对话服务抽取计划.md | 6 + extraction_service/main.py | 573 +++++++++++ extraction_service/operations/__init__.py | 6 + extraction_service/operations/dropna.py | 6 + extraction_service/operations/filter.py | 6 + .../operations/metric_time_transform.py | 921 ++++++++++++++++++ extraction_service/operations/unpivot.py | 289 ++++++ extraction_service/test_dc_api.py | 6 + extraction_service/test_execute_simple.py | 6 + extraction_service/test_module.py | 6 + .../asl/components/FulltextDetailDrawer.tsx | 6 + .../modules/asl/hooks/useFulltextResults.ts | 6 + .../src/modules/asl/hooks/useFulltextTask.ts | 6 + .../src/modules/asl/pages/FulltextResults.tsx | 6 + frontend-v2/src/modules/dc/hooks/useAssets.ts | 6 + .../src/modules/dc/hooks/useRecentTasks.ts | 6 + .../components/BinningDialog_improved.tsx | 6 + .../pages/tool-c/components/DropnaDialog.tsx | 6 + .../tool-c/components/MetricTimePanel.tsx | 401 ++++++++ .../tool-c/components/MultiMetricPanel.tsx | 530 ++++++++++ .../dc/pages/tool-c/components/PivotPanel.tsx | 287 ++++++ .../dc/pages/tool-c/components/Toolbar.tsx | 2 +- .../tool-c/components/TransformDialog.tsx | 111 +++ .../pages/tool-c/components/UnpivotPanel.tsx | 392 ++++++++ .../src/modules/dc/pages/tool-c/index.tsx | 4 +- .../modules/dc/pages/tool-c/types/index.ts | 6 + frontend-v2/src/modules/dc/types/portal.ts | 6 + frontend-v2/src/shared/components/index.ts | 6 + python-microservice/operations/__init__.py | 6 + python-microservice/operations/binning.py | 6 + 
python-microservice/operations/filter.py | 6 + python-microservice/operations/recode.py | 6 + recover_dc_code.py | 6 + run_recovery.ps1 | 6 + tests/QUICKSTART_快速开始.md | 6 + tests/README_测试说明.md | 6 + tests/run_tests.bat | 6 + tests/run_tests.sh | 6 + 快速部署到SAE.md | 6 + 部署检查清单.md | 6 + 123 files changed, 4781 insertions(+), 150 deletions(-) delete mode 100644 docs/05-部署文档/01-部署架构设计.md create mode 100644 extraction_service/operations/metric_time_transform.py create mode 100644 extraction_service/operations/unpivot.py create mode 100644 frontend-v2/src/modules/dc/pages/tool-c/components/MetricTimePanel.tsx create mode 100644 frontend-v2/src/modules/dc/pages/tool-c/components/MultiMetricPanel.tsx create mode 100644 frontend-v2/src/modules/dc/pages/tool-c/components/PivotPanel.tsx create mode 100644 frontend-v2/src/modules/dc/pages/tool-c/components/TransformDialog.tsx create mode 100644 frontend-v2/src/modules/dc/pages/tool-c/components/UnpivotPanel.tsx diff --git a/DC模块代码恢复指南.md b/DC模块代码恢复指南.md index c6a75611..26999dff 100644 --- a/DC模块代码恢复指南.md +++ b/DC模块代码恢复指南.md @@ -230,6 +230,12 @@ + + + + + + diff --git a/backend/migrations/add_data_stats_to_tool_c_session.sql b/backend/migrations/add_data_stats_to_tool_c_session.sql index 770740b0..0fcebe06 100644 --- a/backend/migrations/add_data_stats_to_tool_c_session.sql +++ b/backend/migrations/add_data_stats_to_tool_c_session.sql @@ -32,3 +32,9 @@ WHERE table_schema = 'dc_schema' + + + + + + diff --git a/backend/prisma/manual-migrations/001_add_postgres_cache_and_checkpoint.sql b/backend/prisma/manual-migrations/001_add_postgres_cache_and_checkpoint.sql index 66277b0c..bd21c3ca 100644 --- a/backend/prisma/manual-migrations/001_add_postgres_cache_and_checkpoint.sql +++ b/backend/prisma/manual-migrations/001_add_postgres_cache_and_checkpoint.sql @@ -70,3 +70,9 @@ ORDER BY ordinal_position; + + + + + + diff --git a/backend/prisma/manual-migrations/run-migration-002.ts b/backend/prisma/manual-migrations/run-migration-002.ts 
index d3e5c143..861fc98e 100644 --- a/backend/prisma/manual-migrations/run-migration-002.ts +++ b/backend/prisma/manual-migrations/run-migration-002.ts @@ -83,3 +83,9 @@ runMigration() }); + + + + + + diff --git a/backend/prisma/migrations/20251208_add_column_mapping/migration.sql b/backend/prisma/migrations/20251208_add_column_mapping/migration.sql index 93e7735a..d3af3c68 100644 --- a/backend/prisma/migrations/20251208_add_column_mapping/migration.sql +++ b/backend/prisma/migrations/20251208_add_column_mapping/migration.sql @@ -17,3 +17,9 @@ COMMENT ON COLUMN "dc_schema"."dc_tool_c_sessions"."column_mapping" IS '列名 + + + + + + diff --git a/backend/prisma/migrations/create_tool_c_session.sql b/backend/prisma/migrations/create_tool_c_session.sql index 474a45f2..2e8c5402 100644 --- a/backend/prisma/migrations/create_tool_c_session.sql +++ b/backend/prisma/migrations/create_tool_c_session.sql @@ -44,3 +44,9 @@ COMMENT ON COLUMN dc_schema.dc_tool_c_sessions.expires_at IS '过期时间(创 + + + + + + diff --git a/backend/recover-code-from-cursor-db.js b/backend/recover-code-from-cursor-db.js index 462b9ac6..866952f4 100644 --- a/backend/recover-code-from-cursor-db.js +++ b/backend/recover-code-from-cursor-db.js @@ -188,6 +188,12 @@ function extractCodeBlocks(obj, blocks = []) { + + + + + + diff --git a/backend/scripts/check-dc-tables.mjs b/backend/scripts/check-dc-tables.mjs index b60374d8..687cf872 100644 --- a/backend/scripts/check-dc-tables.mjs +++ b/backend/scripts/check-dc-tables.mjs @@ -210,6 +210,12 @@ checkDCTables(); + + + + + + diff --git a/backend/scripts/create-tool-c-ai-history-table.mjs b/backend/scripts/create-tool-c-ai-history-table.mjs index 9c643c23..8d31d979 100644 --- a/backend/scripts/create-tool-c-ai-history-table.mjs +++ b/backend/scripts/create-tool-c-ai-history-table.mjs @@ -165,3 +165,9 @@ createAiHistoryTable() + + + + + + diff --git a/backend/scripts/create-tool-c-table.js b/backend/scripts/create-tool-c-table.js index 02e72f7b..dc1cc43d 100644 --- 
a/backend/scripts/create-tool-c-table.js +++ b/backend/scripts/create-tool-c-table.js @@ -152,3 +152,9 @@ createToolCTable() + + + + + + diff --git a/backend/scripts/create-tool-c-table.mjs b/backend/scripts/create-tool-c-table.mjs index 8ceca8dc..a937cd40 100644 --- a/backend/scripts/create-tool-c-table.mjs +++ b/backend/scripts/create-tool-c-table.mjs @@ -149,3 +149,9 @@ createToolCTable() + + + + + + diff --git a/backend/src/common/jobs/utils.ts b/backend/src/common/jobs/utils.ts index f2881c05..ef608de5 100644 --- a/backend/src/common/jobs/utils.ts +++ b/backend/src/common/jobs/utils.ts @@ -281,3 +281,9 @@ export function getBatchItems( + + + + + + diff --git a/backend/src/modules/asl/fulltext-screening/__tests__/api-integration-test.ts b/backend/src/modules/asl/fulltext-screening/__tests__/api-integration-test.ts index 071209b4..560ec243 100644 --- a/backend/src/modules/asl/fulltext-screening/__tests__/api-integration-test.ts +++ b/backend/src/modules/asl/fulltext-screening/__tests__/api-integration-test.ts @@ -310,6 +310,12 @@ runTests().catch((error) => { + + + + + + diff --git a/backend/src/modules/asl/fulltext-screening/__tests__/e2e-real-test-v2.ts b/backend/src/modules/asl/fulltext-screening/__tests__/e2e-real-test-v2.ts index db4faa50..ffae80ee 100644 --- a/backend/src/modules/asl/fulltext-screening/__tests__/e2e-real-test-v2.ts +++ b/backend/src/modules/asl/fulltext-screening/__tests__/e2e-real-test-v2.ts @@ -251,6 +251,12 @@ runTest() + + + + + + diff --git a/backend/src/modules/asl/fulltext-screening/__tests__/fulltext-screening-api.http b/backend/src/modules/asl/fulltext-screening/__tests__/fulltext-screening-api.http index 7af78195..2519d610 100644 --- a/backend/src/modules/asl/fulltext-screening/__tests__/fulltext-screening-api.http +++ b/backend/src/modules/asl/fulltext-screening/__tests__/fulltext-screening-api.http @@ -289,6 +289,12 @@ Content-Type: application/json + + + + + + diff --git 
a/backend/src/modules/asl/fulltext-screening/services/ExcelExporter.ts b/backend/src/modules/asl/fulltext-screening/services/ExcelExporter.ts index 63adb20f..2af10833 100644 --- a/backend/src/modules/asl/fulltext-screening/services/ExcelExporter.ts +++ b/backend/src/modules/asl/fulltext-screening/services/ExcelExporter.ts @@ -368,6 +368,12 @@ export class ExcelExporter { + + + + + + diff --git a/backend/src/modules/dc/tool-b/services/ConflictDetectionService.ts b/backend/src/modules/dc/tool-b/services/ConflictDetectionService.ts index 779b3cd9..5ab6da2a 100644 --- a/backend/src/modules/dc/tool-b/services/ConflictDetectionService.ts +++ b/backend/src/modules/dc/tool-b/services/ConflictDetectionService.ts @@ -225,6 +225,12 @@ export const conflictDetectionService = new ConflictDetectionService(); + + + + + + diff --git a/backend/src/modules/dc/tool-b/services/TemplateService.ts b/backend/src/modules/dc/tool-b/services/TemplateService.ts index 6dbd3b82..045a9c6c 100644 --- a/backend/src/modules/dc/tool-b/services/TemplateService.ts +++ b/backend/src/modules/dc/tool-b/services/TemplateService.ts @@ -253,6 +253,12 @@ export const templateService = new TemplateService(); + + + + + + diff --git a/backend/src/modules/dc/tool-b/workers/extractionWorker.ts b/backend/src/modules/dc/tool-b/workers/extractionWorker.ts index 12c27e11..af5a4f30 100644 --- a/backend/src/modules/dc/tool-b/workers/extractionWorker.ts +++ b/backend/src/modules/dc/tool-b/workers/extractionWorker.ts @@ -390,3 +390,9 @@ async function countCompletedBatches(taskId: string): Promise { } + + + + + + diff --git a/backend/src/modules/dc/tool-c/README.md b/backend/src/modules/dc/tool-c/README.md index 5714560f..b7a7f799 100644 --- a/backend/src/modules/dc/tool-c/README.md +++ b/backend/src/modules/dc/tool-c/README.md @@ -182,3 +182,9 @@ curl -X POST http://localhost:3000/api/v1/dc/tool-c/test/execute \ + + + + + + diff --git a/backend/src/modules/dc/tool-c/controllers/QuickActionController.ts 
b/backend/src/modules/dc/tool-c/controllers/QuickActionController.ts index b95e078e..75979f0e 100644 --- a/backend/src/modules/dc/tool-c/controllers/QuickActionController.ts +++ b/backend/src/modules/dc/tool-c/controllers/QuickActionController.ts @@ -25,7 +25,7 @@ import { prisma } from '../../../../config/database.js'; interface QuickActionRequest { sessionId: string; - action: 'filter' | 'recode' | 'binning' | 'conditional' | 'dropna' | 'dedup'; + action: 'filter' | 'recode' | 'binning' | 'conditional' | 'dropna' | 'dedup' | 'compute' | 'pivot' | 'unpivot' | 'metric_time' | 'multi_metric_to_long' | 'multi_metric_to_matrix'; params: any; userId?: string; } @@ -105,6 +105,18 @@ export class QuickActionController { case 'pivot': actionDescription = 'Pivot转换'; break; + case 'unpivot': + actionDescription = 'Unpivot转换(宽→长表)'; + break; + case 'metric_time': + actionDescription = '指标-时间表转换'; + break; + case 'multi_metric_to_long': + actionDescription = '多指标转长表'; + break; + case 'multi_metric_to_matrix': + actionDescription = '多指标转矩阵'; + break; default: logger.warn(`[QuickAction] 不支持的操作: ${action}`); return reply.code(400).send({ @@ -184,6 +196,22 @@ export class QuickActionController { pivotValueOrder ); break; + case 'unpivot': + // Unpivot不需要columnMapping,直接执行 + executeResult = await quickActionService.executeUnpivot(fullData, params); + break; + case 'metric_time': + // 指标-时间表转换 + executeResult = await quickActionService.executeMetricTime(fullData, params); + break; + case 'multi_metric_to_long': + // 多指标转长表 + executeResult = await quickActionService.executeMultiMetricToLong(fullData, params); + break; + case 'multi_metric_to_matrix': + // 多指标转矩阵 + executeResult = await quickActionService.executeMultiMetricToMatrix(fullData, params); + break; } if (!executeResult.success) { @@ -340,9 +368,27 @@ export class QuickActionController { case 'dropna': executeResult = await quickActionService.executeDropna(fullData, params); break; + case 'compute': + executeResult = await 
quickActionService.executeCompute(fullData, params); + break; + case 'dedup': + // TODO: 实现去重功能 + return reply.code(400).send({ success: false, error: '去重功能尚未实现' }); case 'pivot': executeResult = await quickActionService.executePivot(fullData, params); break; + case 'unpivot': + executeResult = await quickActionService.executeUnpivot(fullData, params); + break; + case 'metric_time': + executeResult = await quickActionService.executeMetricTime(fullData, params); + break; + case 'multi_metric_to_long': + executeResult = await quickActionService.executeMultiMetricToLong(fullData, params); + break; + case 'multi_metric_to_matrix': + executeResult = await quickActionService.executeMultiMetricToMatrix(fullData, params); + break; default: return reply.code(400).send({ success: false, error: '不支持的操作' }); } @@ -361,14 +407,29 @@ export class QuickActionController { const newRows = resultData.length; let estimatedChange = ''; - if (action === 'filter' || action === 'dropna') { - estimatedChange = `将保留 ${newRows} 行(删除 ${originalRows - newRows} 行)`; - } else if (action === 'recode' || action === 'binning' || action === 'conditional' || action === 'compute') { - estimatedChange = `将新增 1 列`; - } else if (action === 'pivot') { - const originalCols = Object.keys(fullData[0] || {}).length; - const newCols = Object.keys(resultData[0] || {}).length; - estimatedChange = `行数: ${originalRows} → ${newRows}, 列数: ${originalCols} → ${newCols}`; + switch (action) { + case 'filter': + case 'dropna': + estimatedChange = `将保留 ${newRows} 行(删除 ${originalRows - newRows} 行)`; + break; + case 'recode': + case 'binning': + case 'conditional': + case 'compute': + estimatedChange = `将新增 1 列`; + break; + case 'pivot': + case 'unpivot': + case 'metric_time': + case 'multi_metric_to_long': + case 'multi_metric_to_matrix': { + const originalCols = Object.keys(fullData[0] || {}).length; + const newCols = Object.keys(resultData[0] || {}).length; + estimatedChange = `行数: ${originalRows} → ${newRows}, 列数: 
${originalCols} → ${newCols}`; + break; + } + default: + estimatedChange = `操作完成`; } return reply.code(200).send({ @@ -541,6 +602,95 @@ export class QuickActionController { }); } } + + /** + * POST /api/v1/dc/tool-c/metric-time/detect + * 检测指标-时间表转换模式 + */ + async handleMetricTimeDetect(request: FastifyRequest, reply: FastifyReply) { + try { + const { sessionId, valueVars } = request.body as { sessionId: string; valueVars: string[] }; + + logger.info(`[QuickAction] 检测指标-时间表模式: session=${sessionId}, ${valueVars?.length || 0} 列`); + + // 验证参数 + if (!valueVars || valueVars.length < 2) { + return reply.code(400).send({ + success: false, + error: '至少需要2列才能检测模式' + }); + } + + // 调用Service检测模式 + const result = await quickActionService.detectMetricTimePattern(valueVars); + + if (!result.success) { + return reply.code(500).send({ + success: false, + error: result.error || '模式检测失败' + }); + } + + return reply.code(200).send({ + success: true, + pattern: result.pattern, + execution_time: result.execution_time + }); + + } catch (error: any) { + logger.error(`[QuickAction] 模式检测失败: ${error.message}`); + return reply.code(500).send({ + success: false, + error: error.message + }); + } + } + + /** + * POST /api/v1/dc/tool-c/multi-metric/detect + * 检测多指标分组 + */ + async handleMultiMetricDetect(request: FastifyRequest, reply: FastifyReply) { + try { + const { sessionId, valueVars, separators } = request.body as { + sessionId: string; + valueVars: string[]; + separators?: string[]; + }; + + logger.info(`[QuickAction] 检测多指标分组: session=${sessionId}, ${valueVars?.length || 0} 列`); + + // 验证参数 + if (!valueVars || valueVars.length < 2) { + return reply.code(400).send({ + success: false, + error: '至少需要2列才能检测分组' + }); + } + + // 调用Service检测分组 + const result = await quickActionService.detectMultiMetricGroups(valueVars, separators); + + if (!result.success) { + return reply.code(500).send({ + success: false, + error: result.message || '分组检测失败' + }); + } + + return reply.code(200).send({ + 
success: true, + grouping: result + }); + + } catch (error: any) { + logger.error(`[QuickAction] 多指标分组检测失败: ${error.message}`); + return reply.code(500).send({ + success: false, + error: error.message + }); + } + } } // ==================== 导出单例 ==================== diff --git a/backend/src/modules/dc/tool-c/controllers/StreamAIController.ts b/backend/src/modules/dc/tool-c/controllers/StreamAIController.ts index 12c33e92..61f63de3 100644 --- a/backend/src/modules/dc/tool-c/controllers/StreamAIController.ts +++ b/backend/src/modules/dc/tool-c/controllers/StreamAIController.ts @@ -236,3 +236,9 @@ export const streamAIController = new StreamAIController(); + + + + + + diff --git a/backend/src/modules/dc/tool-c/routes/index.ts b/backend/src/modules/dc/tool-c/routes/index.ts index 71a57ce9..13ba3c72 100644 --- a/backend/src/modules/dc/tool-c/routes/index.ts +++ b/backend/src/modules/dc/tool-c/routes/index.ts @@ -133,5 +133,19 @@ export async function toolCRoutes(fastify: FastifyInstance) { fastify.post('/fillna/mice', { handler: quickActionController.handleFillnaMice.bind(quickActionController), }); + + // ✨ 指标-时间表转换(新增) + + // 检测指标-时间表转换模式 + fastify.post('/metric-time/detect', { + handler: quickActionController.handleMetricTimeDetect.bind(quickActionController), + }); + + // ✨ 多指标转换(新增) + + // 检测多指标分组 + fastify.post('/multi-metric/detect', { + handler: quickActionController.handleMultiMetricDetect.bind(quickActionController), + }); } diff --git a/backend/src/modules/dc/tool-c/services/QuickActionService.ts b/backend/src/modules/dc/tool-c/services/QuickActionService.ts index c3c64900..0cd051cf 100644 --- a/backend/src/modules/dc/tool-c/services/QuickActionService.ts +++ b/backend/src/modules/dc/tool-c/services/QuickActionService.ts @@ -77,6 +77,49 @@ interface PivotParams { unusedAggMethod?: 'first' | 'mode' | 'mean'; // ✨ 新增:未选择列的聚合方式 } +interface UnpivotParams { + idVars: string[]; // ID列(保持不变的列) + valueVars: string[]; // 值列(需要转换的列) + varName: string; // 变量名列名 + 
valueName: string; // 值列名 + parseColumnNames?: boolean; // 是否解析列名 + separator?: string; // 分隔符 + metricName?: string; // 指标列名 + timeName?: string; // 时间列名 + dropna?: boolean; // 是否删除缺失值行 +} + +interface MetricTimeParams { + idVars: string[]; // ID列(保持不变的列) + valueVars: string[]; // 值列(同一指标的多个时间点) + metricName?: string; // 指标名称(可选,自动检测) + separator?: string; // 分隔符(可选,自动检测) + timepointColName?: string; // 时间点列名 +} + +interface MultiMetricToLongParams { + idVars: string[]; // ID列 + valueVars: string[]; // 值列(多个指标的多个时间点) + separators?: string[]; // 可选的分隔符列表 + eventColName?: string; // 时间点列名(默认 'Event_Name') +} + +interface MultiMetricToMatrixParams { + idVars: string[]; // ID列 + valueVars: string[]; // 值列(多个指标的多个时间点) + separators?: string[]; // 可选的分隔符列表 + metricColName?: string; // 指标列名(默认 '指标名') +} + +interface MetricGrouping { + success: boolean; + metric_groups?: Record; // 指标分组 + separator?: string; // 检测到的分隔符 + timepoints?: string[]; // 时间点列表 + confidence?: number; // 置信度 + message?: string; +} + interface FillnaSimpleParams { column: string; newColumnName: string; @@ -100,6 +143,7 @@ interface OperationResult { error?: string; message?: string; stats?: any; + pattern?: any; // ✨ 新增:用于指标-时间表模式检测 } // ==================== 服务类 ==================== @@ -359,6 +403,209 @@ export class QuickActionService { } } + /** + * 执行Unpivot(宽表转长表) + */ + async executeUnpivot(data: any[], params: UnpivotParams): Promise { + try { + logger.info(`[QuickActionService] 调用Unpivot API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/unpivot`, { + data, + id_vars: params.idVars, + value_vars: params.valueVars, + var_name: params.varName || '变量', + value_name: params.valueName || '值', + parse_column_names: params.parseColumnNames || false, + separator: params.separator || '_', + metric_name: params.metricName, + time_name: params.timeName, + dropna: params.dropna || false, + }, { + timeout: 60000, + 
}); + + logger.info(`[QuickActionService] Unpivot成功: ${response.data.result_shape?.[0] || 0} 行`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] Unpivot失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + error: error.message || 'Unpivot失败', + }; + } + } + + /** + * 检测指标-时间表转换模式 + */ + async detectMetricTimePattern(valueVars: string[]): Promise { + try { + logger.info(`[QuickActionService] 检测指标-时间表模式: ${valueVars.length} 列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/metric-time/detect`, { + value_vars: valueVars, + }, { + timeout: 10000, + }); + + logger.info(`[QuickActionService] 模式检测成功`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] 模式检测失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + error: error.message || '模式检测失败', + }; + } + } + + /** + * 执行指标-时间表转换 + */ + async executeMetricTime(data: any[], params: MetricTimeParams): Promise { + try { + logger.info(`[QuickActionService] 调用指标-时间表转换API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/metric-time`, { + data, + id_vars: params.idVars, + value_vars: params.valueVars, + metric_name: params.metricName, + separator: params.separator, + timepoint_col_name: params.timepointColName || '时间点', + }, { + timeout: 60000, + }); + + logger.info(`[QuickActionService] 指标-时间表转换成功: ${response.data.result_shape?.[0] || 0} 行`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] 指标-时间表转换失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + error: error.message || '指标-时间表转换失败', + }; + } + } + + /** + * 检测多指标分组 + */ + async detectMultiMetricGroups(valueVars: string[], separators?: 
string[]): Promise { + try { + logger.info(`[QuickActionService] 调用多指标分组检测API: ${valueVars.length} 列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/detect`, { + value_vars: valueVars, + separators: separators, + }, { + timeout: 10000, + }); + + logger.info(`[QuickActionService] 多指标分组检测成功: ${Object.keys(response.data.metric_groups || {}).length} 个指标`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] 多指标分组检测失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + message: error.message || '多指标分组检测失败', + }; + } + } + + /** + * 执行多指标转长表(时间点为行,指标为列) + */ + async executeMultiMetricToLong(data: any[], params: MultiMetricToLongParams): Promise { + try { + logger.info(`[QuickActionService] 调用多指标转长表API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/to-long`, { + data, + id_vars: params.idVars, + value_vars: params.valueVars, + separators: params.separators, + event_col_name: params.eventColName || 'Event_Name', + }, { + timeout: 60000, + }); + + logger.info(`[QuickActionService] 多指标转长表成功: ${response.data.result_shape?.[0] || 0} 行`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] 多指标转长表失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + error: error.message || '多指标转长表失败', + }; + } + } + + /** + * 执行多指标转矩阵(时间点为列,指标为行) + */ + async executeMultiMetricToMatrix(data: any[], params: MultiMetricToMatrixParams): Promise { + try { + logger.info(`[QuickActionService] 调用多指标转矩阵API: ${params.idVars.length} ID列 × ${params.valueVars.length} 值列`); + + const response = await axios.post(`${PYTHON_SERVICE_URL}/api/operations/multi-metric/to-matrix`, { + data, + id_vars: params.idVars, + value_vars: params.valueVars, + separators: 
params.separators, + metric_col_name: params.metricColName || '指标名', + }, { + timeout: 60000, + }); + + logger.info(`[QuickActionService] 多指标转矩阵成功: ${response.data.result_shape?.[0] || 0} 行`); + return response.data; + + } catch (error: any) { + logger.error(`[QuickActionService] 多指标转矩阵失败: ${error.message}`); + + if (error.response?.data) { + return error.response.data; + } + + return { + success: false, + error: error.message || '多指标转矩阵失败', + }; + } + } + /** * 获取列的缺失值统计 */ @@ -463,3 +710,21 @@ export class QuickActionService { export const quickActionService = new QuickActionService(); +// ==================== 导出类型 ==================== + +export type { + FilterParams, + RecodeParams, + BinningParams, + ConditionalParams, + PivotParams, + UnpivotParams, + MetricTimeParams, + MultiMetricToLongParams, + MultiMetricToMatrixParams, + MetricGrouping, + FillnaSimpleParams, + FillnaMiceParams, + OperationResult, +}; + diff --git a/backend/src/tests/README.md b/backend/src/tests/README.md index 67cf06c5..c697bcd1 100644 --- a/backend/src/tests/README.md +++ b/backend/src/tests/README.md @@ -382,3 +382,9 @@ SET session_replication_role = 'origin'; **作者:** AI Clinical Research Team + + + + + + diff --git a/backend/src/tests/verify-test1-database.sql b/backend/src/tests/verify-test1-database.sql index bed79415..3a105cbf 100644 --- a/backend/src/tests/verify-test1-database.sql +++ b/backend/src/tests/verify-test1-database.sql @@ -84,3 +84,9 @@ WHERE key = 'verify_test'; \echo '==========================================' + + + + + + diff --git a/backend/src/tests/verify-test1-database.ts b/backend/src/tests/verify-test1-database.ts index f568f2a6..6767b598 100644 --- a/backend/src/tests/verify-test1-database.ts +++ b/backend/src/tests/verify-test1-database.ts @@ -227,3 +227,9 @@ verifyDatabase() }); + + + + + + diff --git a/backend/src/types/global.d.ts b/backend/src/types/global.d.ts index 225f8ff2..59b6f264 100644 --- a/backend/src/types/global.d.ts +++ 
b/backend/src/types/global.d.ts @@ -17,3 +17,9 @@ export {} + + + + + + diff --git a/backend/sync-dc-database.ps1 b/backend/sync-dc-database.ps1 index cd506af2..c6b037e3 100644 --- a/backend/sync-dc-database.ps1 +++ b/backend/sync-dc-database.ps1 @@ -33,6 +33,12 @@ Write-Host "✅ 完成!" -ForegroundColor Green + + + + + + diff --git a/backend/test-tool-c-advanced-scenarios.mjs b/backend/test-tool-c-advanced-scenarios.mjs index 090d8f2b..96b950c7 100644 --- a/backend/test-tool-c-advanced-scenarios.mjs +++ b/backend/test-tool-c-advanced-scenarios.mjs @@ -327,3 +327,9 @@ runAdvancedTests().catch(error => { + + + + + + diff --git a/backend/test-tool-c-day2.mjs b/backend/test-tool-c-day2.mjs index 1b317916..57087df9 100644 --- a/backend/test-tool-c-day2.mjs +++ b/backend/test-tool-c-day2.mjs @@ -393,3 +393,9 @@ runAllTests() + + + + + + diff --git a/backend/test-tool-c-day3.mjs b/backend/test-tool-c-day3.mjs index a5699f44..6b5bdfbe 100644 --- a/backend/test-tool-c-day3.mjs +++ b/backend/test-tool-c-day3.mjs @@ -351,3 +351,9 @@ runAllTests() + + + + + + diff --git a/deploy-to-sae.ps1 b/deploy-to-sae.ps1 index f0a6b7ba..be945589 100644 --- a/deploy-to-sae.ps1 +++ b/deploy-to-sae.ps1 @@ -135,3 +135,9 @@ Set-Location .. 
+ + + + + + diff --git a/docs/00-系统总体设计/00-系统当前状态与开发指南.md b/docs/00-系统总体设计/00-系统当前状态与开发指南.md index 3eee53d2..e6f8caa4 100644 --- a/docs/00-系统总体设计/00-系统当前状态与开发指南.md +++ b/docs/00-系统总体设计/00-系统当前状态与开发指南.md @@ -1,10 +1,10 @@ # AIclinicalresearch 系统当前状态与开发指南 -> **文档版本:** v1.8 +> **文档版本:** v1.9 > **创建日期:** 2025-11-28 > **维护者:** 开发团队 -> **最后更新:** 2025-12-13 -> **重大进展:** 🏆 **Postgres-Only 架构改造完成(Phase 1-7)** - Platform层统一任务管理、智能双模式处理、断点续传机制 +> **最后更新:** 2025-12-21 +> **重大进展:** ✨ **DC模块多指标转换功能上线(方向1+2)** - 医学研究专用的重复测量数据转换工具 > **文档目的:** 快速了解系统当前状态,为新AI助手提供上下文 --- @@ -40,7 +40,7 @@ | **AIA** | AI智能问答 | 10+专业智能体(选题评价、PICO梳理等) | ⭐⭐⭐⭐ | ✅ 已完成 | P1 | | **PKB** | 个人知识库 | RAG问答、私人文献库 | ⭐⭐⭐ | ✅ 已完成 | P1 | | **ASL** | AI智能文献 | 文献筛选、Meta分析、证据图谱 | ⭐⭐⭐⭐⭐ | 🚧 **正在开发** | **P0** | -| **DC** | 数据清洗整理 | ETL + 医学NER(百万行级数据) | ⭐⭐⭐⭐⭐ | ✅ **Tool B完成 + Tool C 98%(7个功能+NA处理+Pivot优化+UX重大改进)** | **P0** | +| **DC** | 数据清洗整理 | ETL + 医学NER(百万行级数据) | ⭐⭐⭐⭐⭐ | ✅ **Tool B完成 + Tool C 99%(7个功能+NA处理+Pivot优化+UX重大改进+多指标转换)** | **P0** | | **SSA** | 智能统计分析 | 队列/预测模型/RCT分析 | ⭐⭐⭐⭐⭐ | 📋 规划中 | P2 | | **ST** | 统计分析工具 | 100+轻量化统计工具 | ⭐⭐⭐⭐ | 📋 规划中 | P2 | | **RVW** | 稿件审查系统 | 方法学评估、审稿流程 | ⭐⭐⭐⭐ | 📋 规划中 | P3 | diff --git a/docs/03-业务模块/ASL-AI智能文献/04-开发计划/05-全文复筛前端开发计划.md b/docs/03-业务模块/ASL-AI智能文献/04-开发计划/05-全文复筛前端开发计划.md index 8de927d8..c2dc7ef3 100644 --- a/docs/03-业务模块/ASL-AI智能文献/04-开发计划/05-全文复筛前端开发计划.md +++ b/docs/03-业务模块/ASL-AI智能文献/04-开发计划/05-全文复筛前端开发计划.md @@ -1250,6 +1250,12 @@ interface FulltextScreeningResult { + + + + + + diff --git a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端开发完成.md b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端开发完成.md index 20c31419..307d080f 100644 --- a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端开发完成.md +++ b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端开发完成.md @@ -364,6 +364,12 @@ GET /api/v1/asl/fulltext-screening/tasks/:taskId/export + + + + + + diff --git a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端逻辑调整.md 
b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端逻辑调整.md index af3e64b6..4305cf49 100644 --- a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端逻辑调整.md +++ b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-01-23_全文复筛前端逻辑调整.md @@ -307,6 +307,12 @@ Linter错误:0个 + + + + + + diff --git a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-23_Day5_全文复筛API开发.md b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-23_Day5_全文复筛API开发.md index 25a51725..4400a5bb 100644 --- a/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-23_Day5_全文复筛API开发.md +++ b/docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-23_Day5_全文复筛API开发.md @@ -466,6 +466,12 @@ Failed to open file '\\tmp\\extraction_service\\temp_10000_test.pdf' + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/00-工具C当前状态与开发指南.md b/docs/03-业务模块/DC-数据清洗整理/00-工具C当前状态与开发指南.md index d1589903..75a5dc2d 100644 --- a/docs/03-业务模块/DC-数据清洗整理/00-工具C当前状态与开发指南.md +++ b/docs/03-业务模块/DC-数据清洗整理/00-工具C当前状态与开发指南.md @@ -1,8 +1,8 @@ # 工具C(Tool C)- 科研数据编辑器 - 当前状态与开发指南 -> **最后更新**: 2025-12-10 -> **当前版本**: Day 5-8 MVP + 功能按钮 + NA处理 + Pivot优化 + UX重大改进 -> **开发进度**: Python微服务 ✅ | Session管理 ✅ | AI代码生成 ✅ | 前端完整 ✅ | 通用组件 ✅ | 功能按钮✅(7个)| NA处理✅ | Pivot优化✅ | **UX优化✅(筛选/行号/滚动条/全量数据)** +> **最后更新**: 2025-12-21 +> **当前版本**: Day 5-8 MVP + 功能按钮 + NA处理 + Pivot优化 + UX重大改进 + **多指标转换✅** +> **开发进度**: Python微服务 ✅ | Session管理 ✅ | AI代码生成 ✅ | 前端完整 ✅ | 通用组件 ✅ | 功能按钮✅(7个)| NA处理✅ | Pivot优化✅ | UX优化✅ | **多指标转换✅(方向1+2)** --- @@ -21,7 +21,144 @@ --- -## ✅ 已完成功能(Day 1-8) +## ✅ 已完成功能(Day 1-9) + +### 🎉 Day 9 多指标转换功能(2025-12-21)✅ + +#### 1. 功能概述 +**医学研究专用的多指标重复测量数据转换工具**,支持两个转换方向: + +| 转换方向 | 输入格式 | 输出格式 | 适用场景 | +|---------|---------|---------|---------| +| **方向1:分析格式** | 宽表 | 时间点→行,指标→列 | 统计分析、混合效应模型、GEE、数据可视化 | +| **方向2:展示格式** | 宽表 | 时间点→列,指标→行 | 临床报告、数据审查表、CRF核对、单受试者数据审查 | + +#### 2. 
核心功能 ✅ + +**2.1 智能自动分组** ✅ +- ✅ 自动检测列名中的指标名称和时间点 +- ✅ 智能识别分隔符(`___`、`__`、`_`、`-`、`.`等) +- ✅ 公共前缀智能扩展(修复"FMA总得分___基线"识别问题) +- ✅ 时间点一致性验证 +- ✅ 置信度评分 + +**示例**: +``` +输入列名:FMA总得分___筛选及基线、FMA总得分___随访(2周)、ADL总分___基线、ADL总分___随访(2周) +自动检测: + ✓ 3个指标:FMA总得分、ADL总分、FMA疗效 + ✓ 8个时间点:筛选及基线、随访(2周)、随访(1个月)... + ✓ 分隔符:"___" +``` + +**2.2 方向1:多指标转长表(时间点为行,指标为列)** ✅ +- ✅ 适用场景:R/Python统计分析、ggplot2/seaborn可视化、机器学习 +- ✅ 列顺序优化:`ID列 → Event_Name → 各指标列` +- ✅ 保持原始Record ID顺序 +- ✅ 自动处理缺失值(outer join) + +**示例**: +``` +输入(宽表): +Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1 +4 | 58 | 67 | 40 | 95 +5 | 61 | 79 | 35 | 85 + +输出(长表): +Record_ID | Event_Name | FMA总得分 | ADL总分 +4 | 基线 | 58 | 40 +4 | 随访1 | 67 | 95 +5 | 基线 | 61 | 35 +5 | 随访1 | 79 | 85 +``` + +**2.3 方向2:多指标转矩阵(时间点为列,指标为行)** ✅ +- ✅ 适用场景:临床报告、数据审查、CRF核对 +- ✅ 列顺序优化:`ID列 → 指标名列 → 各时间点列` +- ✅ 保持原始Record ID顺序 +- ✅ 时间点列按原始顺序排列 + +**示例**: +``` +输入(宽表): +Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1 +4 | 58 | 67 | 40 | 95 + +输出(矩阵): +Record_ID | 指标名 | 基线 | 随访1 +4 | FMA总得分 | 58 | 67 +4 | ADL总分 | 40 | 95 +``` + +#### 3. UX优化 ✅ + +| 功能 | 说明 | 状态 | +|------|------|------| +| 转换方向选择 | Radio组件,两个选项,带场景说明 | ✅ | +| 全选/清空按钮 | 快速选择所有值列 | ✅ | +| 实时预览 | 选择列后自动生成预览(前10行) | ✅ | +| 智能表单 | 根据转换方向动态显示不同的输入框 | ✅ | +| 可视化分组结果 | Tag标签展示检测到的指标和时间点 | ✅ | +| 置信度提示 | 检测置信度<1.0时显示警告 | ✅ | + +#### 4.
技术架构 ✅ + +**4.1 Python层(`metric_time_transform.py`)** +- ✅ `detect_metric_groups()` - 自动分组检测(300行) +- ✅ `apply_multi_metric_to_long()` - 方向1转换(150行) +- ✅ `apply_multi_metric_to_matrix()` - 方向2转换(180行) +- ✅ 智能排序:保持原始Record ID顺序 + +**4.2 Python API(`main.py`)** +- ✅ `POST /api/operations/multi-metric/detect` - 检测指标分组 +- ✅ `POST /api/operations/multi-metric/to-long` - 执行方向1转换 +- ✅ `POST /api/operations/multi-metric/to-matrix` - 执行方向2转换 + +**4.3 Node.js Backend** +- ✅ `QuickActionService.ts` - 3个新方法 +- ✅ `QuickActionController.ts` - 支持2个新action +- ✅ 路由注册:`/multi-metric/detect` + +**4.4 Frontend(`MultiMetricPanel.tsx`)** +- ✅ 转换方向选择(Radio组件) +- ✅ 智能表单(动态显示) +- ✅ 实时检测和预览 +- ✅ 完整的错误处理 + +#### 5. 关键技术突破 ✅ + +| 技术点 | 问题 | 解决方案 | +|-------|------|---------| +| 列名识别 | "FMA总得分___基线" 被错误识别为 "FMA" | 智能修正算法:扩展公共前缀 | +| 列顺序 | Event_Name位置随机 | 强制列顺序:ID → Event_Name → 指标 | +| Record ID顺序 | 转换后按字典序排序(4,10,11,5,6) | 添加临时列 `_original_order` 保持原始顺序 | +| 分隔符识别 | 不支持三重下划线 `___` | 优先级列表:`['___', '__', '_', '-', '.']` | +| 时间点提取 | `.lstrip()` 错误移除字符 | 使用 `.startswith()` 精确匹配 | + +#### 6. 测试覆盖 ✅ + +| 测试场景 | 测试数据 | 状态 | +|---------|---------|------| +| 单ID列,多指标 | Record_ID: 4,5,6,10,11 | ✅ | +| 三重下划线分隔符 | `FMA总得分___筛选及基线` | ✅ | +| 括号时间点 | `随访(2周)` | ✅ | +| 中文列名 | `FMA疗效` | ✅ | +| 空值处理 | outer join保留所有时间点 | ✅ | +| 原始顺序保持 | 4→5→6→10→11 | ✅ | + +#### 7. 
代码统计 ✅ + +| 文件 | 新增代码 | 说明 | +|------|---------|------| +| `metric_time_transform.py` | ~600行 | Python核心算法 | +| `main.py` | ~150行 | 3个API端点 | +| `QuickActionService.ts` | ~100行 | 3个新方法 | +| `QuickActionController.ts` | ~50行 | Action支持 | +| `MultiMetricPanel.tsx` | ~530行 | 完整UI组件 | +| `TransformDialog.tsx` | ~30行 | Tab集成 | +| **总计** | **~1460行** | **完整功能实现** | + +--- ### 🚀 Day 7-8 NA处理优化 + Pivot列顺序优化(2025-12-09~10) diff --git a/docs/03-业务模块/DC-数据清洗整理/00-模块当前状态与开发指南.md b/docs/03-业务模块/DC-数据清洗整理/00-模块当前状态与开发指南.md index 6427b892..74d850dd 100644 --- a/docs/03-业务模块/DC-数据清洗整理/00-模块当前状态与开发指南.md +++ b/docs/03-业务模块/DC-数据清洗整理/00-模块当前状态与开发指南.md @@ -1,10 +1,10 @@ # DC数据清洗整理模块 - 当前状态与开发指南 -> **文档版本:** v3.2 +> **文档版本:** v3.3 > **创建日期:** 2025-11-28 > **维护者:** DC模块开发团队 -> **最后更新:** 2025-12-13 🏆 **Postgres-Only 架构改造完成!** -> **重大里程碑:** Tool C MVP完成 + Tool B Postgres-Only架构改造(智能双模式、任务拆分、断点续传) +> **最后更新:** 2025-12-21 ✨ **多指标转换功能上线!** +> **重大里程碑:** Tool C MVP完成 + Tool B Postgres-Only架构改造 + **Tool C多指标转换(方向1+2)** > **文档目的:** 反映模块真实状态,记录开发历程 --- @@ -67,17 +67,18 @@ DC数据清洗整理模块提供4个智能工具,帮助研究人员清洗、 - ✅ 断点续传支持(支持长时间提取任务) - ✅ Platform层统一管理(job.data存储) - ✅ Worker注册(extractionWorker.ts) - - ✅ **Tool C 完整实现**(2025-12-06 ~ 2025-12-10): - - ✅ Python微服务(~1800行,Day 1 + NA处理优化 + 全量数据处理) - - ✅ Node.js后端(~3500行,Day 2-3,Day 5-8增强 + 全量返回) - - ✅ 前端界面(~4000行,Day 4-8,筛选/行号/滚动条/全量加载) + - ✅ **Tool C 完整实现**(2025-12-06 ~ 2025-12-21): + - ✅ Python微服务(~2400行,Day 1 + NA处理优化 + 全量数据处理 + 多指标转换) + - ✅ Node.js后端(~3600行,Day 2-3,Day 5-8增强 + 全量返回 + 多指标转换) + - ✅ 前端界面(~4500行,Day 4-8,筛选/行号/滚动条/全量加载 + 多指标转换) - ✅ **通用 Chat 组件**(~968行,Day 5)🎉 - ✅ 7个功能按钮(Day 6) - ✅ NA处理优化(4个功能,Day 7) - ✅ Pivot列顺序优化(Day 7-8) - ✅ 计算列方案B(安全列名映射,Day 7-8) - ✅ **UX重大改进**(列头筛选/行号/滚动条修复/全量数据,Day 8) - - **总计:~13068行** | **完成度:98%** + - ✅ **多指标转换**(方向1+2,智能分组,原始顺序保持,Day 9) + - **总计:~14528行** | **完成度:99%** - **重大成就**: - 🎉 **前端通用能力层建设完成** - ✨ 基于 Ant Design X 的 Chat 组件库 diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_AI_Few-shot示例库.md 
b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_AI_Few-shot示例库.md index a798e8d6..f924d3ad 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_AI_Few-shot示例库.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_AI_Few-shot示例库.md @@ -539,3 +539,9 @@ df['creatinine'] = pd.to_numeric(df['creatinine'], errors='coerce') + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Bug修复总结_2025-12-08.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Bug修复总结_2025-12-08.md index fc17e19b..4077eb34 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Bug修复总结_2025-12-08.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Bug修复总结_2025-12-08.md @@ -377,3 +377,9 @@ npm run dev + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day3开发计划.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day3开发计划.md index 9702f939..e3b1b8c0 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day3开发计划.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day3开发计划.md @@ -954,3 +954,9 @@ export const aiController = new AIController(); + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day4-5前端开发计划.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day4-5前端开发计划.md index 194f9692..db059d50 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day4-5前端开发计划.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Day4-5前端开发计划.md @@ -1288,3 +1288,9 @@ npm install react-markdown + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Pivot列顺序优化总结.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Pivot列顺序优化总结.md index adf28bf5..cbd5f1d0 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Pivot列顺序优化总结.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_Pivot列顺序优化总结.md @@ -196,3 +196,9 @@ FMA___基线 | FMA___1个月 | FMA___2个月 + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_方案B实施总结_2025-12-09.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_方案B实施总结_2025-12-09.md index f8958f9b..1427b763 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_方案B实施总结_2025-12-09.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_方案B实施总结_2025-12-09.md @@ -354,3 +354,9 @@ formula = "FMA总分(0-100) / 100" + + + + + + diff --git 
a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理_开发进度_2025-12-10.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理_开发进度_2025-12-10.md index 13056d08..d1fb5374 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理_开发进度_2025-12-10.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理_开发进度_2025-12-10.md @@ -188,3 +188,9 @@ async handleFillnaMice(request, reply) { + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理功能_更新说明.md b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理功能_更新说明.md index 3acaf9e4..b6a11833 100644 --- a/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理功能_更新说明.md +++ b/docs/03-业务模块/DC-数据清洗整理/04-开发计划/工具C_缺失值处理功能_更新说明.md @@ -160,3 +160,9 @@ method: 'mean' | 'median' | 'mode' | 'constant' | 'ffill' | 'bfill' + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-02_工作总结.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-02_工作总结.md index 2447b09a..e58539e4 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-02_工作总结.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-02_工作总结.md @@ -307,6 +307,12 @@ Changes: + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day1开发完成总结.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day1开发完成总结.md index 06ffa88f..f8fa28a6 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day1开发完成总结.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day1开发完成总结.md @@ -382,3 +382,9 @@ cd path; command + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day2开发完成总结.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day2开发完成总结.md index 2436926a..65c495c5 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day2开发完成总结.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-06_工具C_Day2开发完成总结.md @@ -611,3 +611,9 @@ import { logger } from '../../../../common/logging/index.js'; + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_AI对话核心功能增强总结.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_AI对话核心功能增强总结.md index 1378dd3e..ac5921fb 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_AI对话核心功能增强总结.md +++ 
b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_AI对话核心功能增强总结.md @@ -615,3 +615,9 @@ Content-Length: 45234 + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Bug修复_DataGrid空数据防御.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Bug修复_DataGrid空数据防御.md index 98d4d15f..72eaaad9 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Bug修复_DataGrid空数据防御.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Bug修复_DataGrid空数据防御.md @@ -267,3 +267,9 @@ Response: + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5_Ant-Design-X重构完成.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5_Ant-Design-X重构完成.md index 1dbc5a52..b1cd9166 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5_Ant-Design-X重构完成.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5_Ant-Design-X重构完成.md @@ -420,3 +420,9 @@ Response: + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5最终总结.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5最终总结.md index 84dcf414..216d1100 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5最终总结.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_Day5最终总结.md @@ -414,3 +414,9 @@ import { ChatContainer } from '@/shared/components/Chat'; + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_UI优化与Bug修复.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_UI优化与Bug修复.md index c1e59dc8..8eb71d2c 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_UI优化与Bug修复.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_UI优化与Bug修复.md @@ -324,3 +324,9 @@ const initialMessages = defaultMessages.length > 0 ? 
defaultMessages : [{ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_后端API完整对接完成.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_后端API完整对接完成.md index 1cb849d6..9b4c7356 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_后端API完整对接完成.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_后端API完整对接完成.md @@ -364,3 +364,9 @@ python main.py + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_完整UI优化与功能增强.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_完整UI优化与功能增强.md index fabde21a..a513b53e 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_完整UI优化与功能增强.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_完整UI优化与功能增强.md @@ -612,3 +612,9 @@ http://localhost:5173/data-cleaning/tool-c + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_工具C_Day4前端基础完成.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_工具C_Day4前端基础完成.md index 5dd93d8e..57fe0e2e 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_工具C_Day4前端基础完成.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/2025-12-07_工具C_Day4前端基础完成.md @@ -222,3 +222,9 @@ Day 5 (6-8小时): + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建完成总结-Day1.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建完成总结-Day1.md index d3612cee..113b8624 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建完成总结-Day1.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建完成总结-Day1.md @@ -393,6 +393,12 @@ Docs: docs/03-业务模块/DC-数据清洗整理/06-开发记录/DC模块重建 + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase1-Portal页面开发完成-2025-12-02.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase1-Portal页面开发完成-2025-12-02.md index d7286950..9d6a19e8 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase1-Portal页面开发完成-2025-12-02.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase1-Portal页面开发完成-2025-12-02.md @@ -372,6 +372,12 @@ const mockAssets: Asset[] = [ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase2-ToolB-Step1-2开发完成-2025-12-03.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase2-ToolB-Step1-2开发完成-2025-12-03.md index dc287f57..166e5347 100644 --- 
a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase2-ToolB-Step1-2开发完成-2025-12-03.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Phase2-ToolB-Step1-2开发完成-2025-12-03.md @@ -357,5 +357,11 @@ frontend-v2/src/modules/dc/ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Portal页面UI优化-2025-12-02.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Portal页面UI优化-2025-12-02.md index fea405f4..923db5ea 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Portal页面UI优化-2025-12-02.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Portal页面UI优化-2025-12-02.md @@ -316,6 +316,12 @@ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Tool-B-MVP完成总结-2025-12-03.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Tool-B-MVP完成总结-2025-12-03.md index 570f1171..75c53c26 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Tool-B-MVP完成总结-2025-12-03.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/Tool-B-MVP完成总结-2025-12-03.md @@ -271,5 +271,11 @@ ConflictDetectionService // 冲突检测(字段级对比) + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-2025-12-03.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-2025-12-03.md index f3f73ba8..af8ffe75 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-2025-12-03.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-2025-12-03.md @@ -320,5 +320,11 @@ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-Round2-2025-12-03.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-Round2-2025-12-03.md index b2c157d2..159aae8f 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-Round2-2025-12-03.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB-UI优化-Round2-2025-12-03.md @@ -283,5 +283,11 @@ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB浏览器测试计划-2025-12-03.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB浏览器测试计划-2025-12-03.md index 32ffcc65..e99f204c 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB浏览器测试计划-2025-12-03.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/ToolB浏览器测试计划-2025-12-03.md @@ -347,5 +347,11 @@ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/后端API测试报告-2025-12-02.md 
b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/后端API测试报告-2025-12-02.md index 6ae4b75f..bedb65fa 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/后端API测试报告-2025-12-02.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/后端API测试报告-2025-12-02.md @@ -434,6 +434,12 @@ Tool B后端代码**100%复用**了平台通用能力层,无任何重复开发 + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/待办事项-下一步工作.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/待办事项-下一步工作.md index b416e3be..2dff8ec0 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/待办事项-下一步工作.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/待办事项-下一步工作.md @@ -281,5 +281,11 @@ + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/数据库验证报告-2025-12-02.md b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/数据库验证报告-2025-12-02.md index bbf41b6f..0cd46e56 100644 --- a/docs/03-业务模块/DC-数据清洗整理/06-开发记录/数据库验证报告-2025-12-02.md +++ b/docs/03-业务模块/DC-数据清洗整理/06-开发记录/数据库验证报告-2025-12-02.md @@ -211,6 +211,12 @@ $ node scripts/check-dc-tables.mjs + + + + + + diff --git a/docs/03-业务模块/DC-数据清洗整理/07-技术债务/Tool-B技术债务清单.md b/docs/03-业务模块/DC-数据清洗整理/07-技术债务/Tool-B技术债务清单.md index efadda67..ec4d5387 100644 --- a/docs/03-业务模块/DC-数据清洗整理/07-技术债务/Tool-B技术债务清单.md +++ b/docs/03-业务模块/DC-数据清洗整理/07-技术债务/Tool-B技术债务清单.md @@ -445,5 +445,11 @@ ${fields.map((f, i) => `${i + 1}. 
${f.name}:${f.desc}`).join('\n')} + + + + + + diff --git a/docs/05-部署文档/01-部署架构设计.md b/docs/05-部署文档/01-部署架构设计.md deleted file mode 100644 index 5d70bf79..00000000 --- a/docs/05-部署文档/01-部署架构设计.md +++ /dev/null @@ -1,52 +0,0 @@ -# 部署架构设计 - -> **文档版本:** v1.0 -> **创建日期:** 2025-10-29 -> **维护者:** 架构团队 -> **最后更新:** 2025-10-29 - ---- - -## 📋 文档说明 - -本文档描述系统的部署架构设计,包括: -- 部署模式(云部署、本地化部署、混合部署) -- 部署方案(Docker、Kubernetes等) -- 环境配置 -- 模块独立部署方案 - ---- - -## ⏳ 待完善 - -本文档内容待规划完善,目前仅作为占位文档。 - ---- - -**文档版本:** v1.0 -**最后更新:** 2025-10-29 - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/05-部署文档/02-SAE部署完全指南(产品经理版).md b/docs/05-部署文档/02-SAE部署完全指南(产品经理版).md index a9e02a07..8979ded6 100644 --- a/docs/05-部署文档/02-SAE部署完全指南(产品经理版).md +++ b/docs/05-部署文档/02-SAE部署完全指南(产品经理版).md @@ -854,3 +854,9 @@ ACR镜像仓库: + + + + + + diff --git a/docs/05-部署文档/03-Dify-ECS部署完全指南.md b/docs/05-部署文档/03-Dify-ECS部署完全指南.md index 48643cd6..75c44933 100644 --- a/docs/05-部署文档/03-Dify-ECS部署完全指南.md +++ b/docs/05-部署文档/03-Dify-ECS部署完全指南.md @@ -168,7 +168,7 @@ RAG 系统迁移的复杂度: 专有网络 VPC: 选择 SAE 所在的 VPC 安全组: 创建新安全组,配置入方向规则(⚠️ 安全红线): ✅ 允许 22/TCP 来源:您的办公室公网IP # SSH管理 - ✅ 允许 80/TCP 来源:172.16.0.0/12 # Nginx(VPC内网访问) + ✅ 允许 80/TCP 来源:172.17.0.0/16 # Nginx(VPC内网访问) ❌ 拒绝 5000/TCP 来源:0.0.0.0/0 # Dify API禁止公网访问 ❌ 拒绝 6379/TCP 来源:0.0.0.0/0 # Redis禁止公网访问 ❌ 拒绝 8080/TCP 来源:0.0.0.0/0 # Weaviate禁止公网访问 @@ -177,7 +177,7 @@ RAG 系统迁移的复杂度: ⚠️ 安全警告: - Dify API (5000)、Redis (6379)、Weaviate (8080) 绝对不能对公网开放 - - 只允许VPC内网访问(172.16.0.0/12) + - 只允许VPC内网访问(172.17.0.0/16) - 端口绑定到 127.0.0.1(见docker-compose.yaml配置) ``` diff --git a/docs/05-部署文档/04-Python微服务-SAE容器部署指南.md b/docs/05-部署文档/04-Python微服务-SAE容器部署指南.md index d3cd067f..479abc18 100644 --- a/docs/05-部署文档/04-Python微服务-SAE容器部署指南.md +++ b/docs/05-部署文档/04-Python微服务-SAE容器部署指南.md @@ -540,19 +540,19 @@ docker rm extraction-test ```bash # 1. 
登录阿里云容器镜像服务 # 获取登录命令:阿里云控制台 → 容器镜像服务 → 访问凭证 → 设置Registry登录密码 -docker login --username= registry.cn-hangzhou.aliyuncs.com +docker login --username= registry.cn-beijing.aliyuncs.com # 2. 给镜像打标签 docker tag extraction-service:latest \ - registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0 + registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0 # 3. 推送到阿里云 -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0 +docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0 # 4. 推送 latest 标签(便于后续更新) docker tag extraction-service:latest \ - registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest + registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest +docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest ``` --- @@ -572,7 +572,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-servi 3. **镜像配置**: ``` - 镜像地址: registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:latest + 镜像地址: registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:latest 镜像版本: latest 镜像拉取策略: Always(每次部署都拉取最新镜像) ``` @@ -690,7 +690,7 @@ TZ=Asia/Shanghai 3. **查看并复制"内网访问地址"**,通常是以下格式之一: ``` # 格式 1: 内网 IP + 端口(⭐⭐⭐⭐⭐ 强烈推荐,最稳定) - 172.16.0.10:8000 + 172.17.x.x:8000 # 格式 2: SAE 内网 Service 域名(需要额外配置服务发现,不推荐) extraction-service-xxxxx.cn-hangzhou.sae.aliyuncs.com:8000 @@ -716,7 +716,7 @@ TZ=Asia/Shanghai 5. 
**✅ 推荐做法(按优先级排序)**: ```bash # ⭐⭐⭐⭐⭐ 方案A:直接使用内网IP(强烈推荐) - EXTRACTION_SERVICE_URL=http://172.16.0.10:8000 + EXTRACTION_SERVICE_URL=http://172.17.x.x:8000 # 获取方式:SAE控制台 > Python应用 > 实例列表 > 查看内网IP # ⭐⭐⭐ 方案B:使用SAE服务发现(需要额外配置,不推荐初期使用) @@ -730,7 +730,7 @@ TZ=Asia/Shanghai ```bash # ⚠️ 使用 SAE 控制台显示的真实内网地址 -EXTRACTION_SERVICE_URL=http://172.16.0.10:8000 +EXTRACTION_SERVICE_URL=http://172.17.x.x:8000 # 注意: # 1. 不要使用猜测的域名 @@ -817,7 +817,7 @@ export async function testExtractionService() { 2. **查看 Node.js 后端日志**(SAE 控制台 → 后端应用 → 日志): ``` - [INFO] Calling extraction service: http://172.16.0.10:8000/extract/pdf + [INFO] Calling extraction service: http://172.17.x.x:8000/extract/pdf [INFO] Extraction completed in 2.3s [INFO] Extracted text preview: "This is a test document..." ``` @@ -1050,7 +1050,7 @@ pip list --outdated # 2. 重建镜像(包含安全更新) docker build -t extraction-service:v1.1 . -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.1 +docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.1 # 3. 在 SAE 中更新镜像版本 ``` @@ -1131,7 +1131,7 @@ with open(pdf_path, 'rb') as f: ``` 后端日志:Connection refused 或 -ECONNREFUSED: connect ECONNREFUSED 172.16.0.10:8000 +ECONNREFUSED: connect ECONNREFUSED 172.17.x.x:8000 或 Error: getaddrinfo ENOTFOUND extraction-service.internal ``` @@ -1144,7 +1144,7 @@ Error: getaddrinfo ENOTFOUND extraction-service.internal EXTRACTION_SERVICE_URL=http://extraction-service.internal:8000 # ✅ 正确配置(SAE 控制台显示的真实地址) -EXTRACTION_SERVICE_URL=http://172.16.0.10:8000 +EXTRACTION_SERVICE_URL=http://172.17.x.x:8000 ``` **解决方法**: @@ -1300,7 +1300,7 @@ EXTRACTION_SERVICE_URL=http://extraction-service:8000 # ✅ 正确做法:从 SAE 控制台获取真实地址 # SAE 控制台 → extraction-service 应用 → 应用访问配置 # 复制显示的"VPC 内网访问地址" -EXTRACTION_SERVICE_URL=http://172.16.0.10:8000 +EXTRACTION_SERVICE_URL=http://172.17.x.x:8000 ``` **原因**: @@ -1498,7 +1498,7 @@ echo "Done!" docker build -t extraction-service:v1.0 . 
# 推送镜像 -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/extraction-service:v1.0 +docker push registry.cn-beijing.aliyuncs.com/clinical-research/extraction-service:v1.0 # 查看 SAE 日志 # SAE 控制台 → 应用详情 → 日志 diff --git a/docs/05-部署文档/05-Node.js后端-SAE容器部署指南.md b/docs/05-部署文档/05-Node.js后端-SAE容器部署指南.md index 00d71b61..da54ebf2 100644 --- a/docs/05-部署文档/05-Node.js后端-SAE容器部署指南.md +++ b/docs/05-部署文档/05-Node.js后端-SAE容器部署指南.md @@ -93,7 +93,7 @@ npm --version - [ ] **RDS PostgreSQL 15** 实例已创建并运行 - 数据库名称:`ai_clinical`(或自定义) - 用户名和密码已准备 - - 内网地址已获取(如 `rm-xxxxx.pg.rds.aliyuncs.com:5432`) + - 内网地址已获取(如 `pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432`) - 白名单已配置(允许 SAE VPC 访问) - [ ] **阿里云容器镜像服务 ACR** 已开通 @@ -104,8 +104,8 @@ npm --version - VPC 和交换机已选择(与 RDS 在同一 VPC) - [ ] **依赖服务的内网地址已获取**: - - Python 微服务(SAE):`http://172.16.0.10:8000` - - Dify 服务(ECS):`http://172.16.0.20:80` + - Python 微服务(SAE):`http://172.17.x.x:8000` + - Dify 服务(ECS):`http://172.17.x.x:80` #### 敏感信息准备 @@ -113,7 +113,7 @@ npm --version ```bash # 数据库 -DATABASE_URL=postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10 +DATABASE_URL=postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical?connection_limit=18&pool_timeout=10 # LLM API Keys(至少配置一个) DEEPSEEK_API_KEY=sk-xxxxx @@ -122,10 +122,10 @@ CLOSEAI_API_KEY=sk-xxxxx # Dify DIFY_API_KEY=app-xxxxx -DIFY_API_URL=http://172.16.0.20:80/v1 +DIFY_API_URL=http://172.17.x.x:80/v1 # 阿里云 OSS -OSS_REGION=oss-cn-hangzhou +OSS_REGION=oss-cn-beijing OSS_BUCKET=clinical-research-files OSS_ACCESS_KEY_ID=LTAI5t... OSS_ACCESS_KEY_SECRET=xxx... 
@@ -157,10 +157,10 @@ Node.js 后端(SAE) ├──→ RDS PostgreSQL 15(数据库) │ ├──→ Python 微服务(SAE) - 文档提取 - │ └─ http://172.16.0.10:8000 + │ └─ http://172.17.x.x:8000 │ ├──→ Dify 服务(ECS) - RAG 知识库 - │ └─ http://172.16.0.20:80/v1 + │ └─ http://172.17.x.x:80/v1 │ └──→ 阿里云 OSS - 文件存储 └─ clinical-research-files @@ -247,7 +247,7 @@ cp backend/.env backend/.env.backup # 2. 创建临时 RDS 连接配置 cat > backend/.env.rds <:80/v1 # 1. 检查环境变量 # SAE 控制台 → 应用详情 → 环境变量 # 确认以下变量正确: -OSS_REGION=oss-cn-hangzhou +OSS_REGION=oss-cn-beijing OSS_BUCKET=clinical-research-files OSS_ACCESS_KEY_ID=LTAI5t... OSS_ACCESS_KEY_SECRET=xxx... @@ -1825,7 +1825,7 @@ PrismaClientKnownRequestError: column "phone" does not exist ```bash # 1. 在本地开发环境,连接到 RDS -export DATABASE_URL="postgresql://username:password@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical" +export DATABASE_URL="postgresql://username:password@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical" # 2. 反向同步 Schema npx prisma db pull @@ -2006,7 +2006,7 @@ pg_dump → 导入 RDS → prisma db pull(同步)→ 构建镜像 → 部署 ```typescript // ❌ 错误示例 -const dbUrl = 'postgresql://admin:P@ssw0rd@rm-xxxxx.pg.rds.aliyuncs.com:5432/ai_clinical'; +const dbUrl = 'postgresql://admin:P@ssw0rd@pgm-2zex1m2y3r23hdn5.pg.rds.aliyuncs.com:5432/ai_clinical'; // ✅ 正确做法 const dbUrl = process.env.DATABASE_URL; diff --git a/docs/05-部署文档/06-前端Nginx-SAE容器部署指南.md b/docs/05-部署文档/06-前端Nginx-SAE容器部署指南.md index a4515202..9614e197 100644 --- a/docs/05-部署文档/06-前端Nginx-SAE容器部署指南.md +++ b/docs/05-部署文档/06-前端Nginx-SAE容器部署指南.md @@ -106,7 +106,7 @@ npm run build #### 阿里云资源 - [ ] **后端服务(SAE)** 已部署并运行 - - 后端 VPC 内网地址已获取(如 `http://172.16.0.30:3001`) + - 后端 VPC 内网地址已获取(如 `http://172.17.x.x:3001`) - 后端健康检查可访问 - [ ] **阿里云容器镜像服务 ACR** 已开通 @@ -120,7 +120,7 @@ npm run build ```bash # 后端服务内网地址(关键) -BACKEND_SERVICE_URL=http://172.16.0.30:3001 +BACKEND_SERVICE_URL=http://172.17.x.x:3001 # 如果需要配置环境变量(可选) # VITE_API_BASE_URL 在构建时注入(很少使用) @@ -217,7 +217,7 @@ ASL 模块:GET /api/v1/asl/projects ↓ Nginx 反向代理 ↓ 
-后端服务:http://172.16.0.30:3001/api/v1/asl/projects +后端服务:http://172.17.x.x:3001/api/v1/asl/projects ``` ### 📝 构建流程 @@ -363,7 +363,7 @@ http { server ${BACKEND_SERVICE_HOST}:${BACKEND_SERVICE_PORT} fail_timeout=30s max_fails=3; # 如果有多个后端实例(负载均衡) - # server 172.16.0.30:3001 weight=1; + # server 172.17.x.x:3001 weight=1; # server 172.16.0.31:3001 weight=1; keepalive 32; # 保持连接池 @@ -485,7 +485,7 @@ http { access_log off; # 仅允许内网访问 allow 10.0.0.0/8; - allow 172.16.0.0/12; + allow 172.17.0.0/16; allow 192.168.0.0/16; deny all; } @@ -547,7 +547,7 @@ Nginx:接收请求 ↓ Nginx:proxy_pass http://backend ↓ -后端服务:http://172.16.0.30:3001/api/v1/projects +后端服务:http://172.17.x.x:3001/api/v1/projects ↓ 后端返回数据 ↓ @@ -806,7 +806,7 @@ nginx.conf.template(模板): ↓ envsubst 替换 nginx.conf(最终配置): - server 172.16.0.30:3001; + server 172.17.x.x:3001; ``` #### 3. 健康检查 @@ -987,7 +987,7 @@ docker rm frontend-test ```bash # 登录(使用 ACR 密码,不是阿里云账号密码) -docker login --username=your-aliyun-account registry.cn-hangzhou.aliyuncs.com +docker login --username=your-aliyun-account registry.cn-beijing.aliyuncs.com # 输入密码后看到: # Login Succeeded @@ -998,21 +998,21 @@ docker login --username=your-aliyun-account registry.cn-hangzhou.aliyuncs.com ```bash # 格式:registry地址/命名空间/仓库名:版本号 docker tag frontend-service:v1.0.0 \ - registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.0 + registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.0 # 同时打一个 latest 标签 docker tag frontend-service:v1.0.0 \ - registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:latest + registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:latest ``` ### 步骤 3:推送镜像 ```bash # 推送指定版本 -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.0 +docker push registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.0 # 推送 latest -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:latest +docker push 
registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:latest # 推送过程需要 1-3 分钟(镜像很小) ``` @@ -1055,7 +1055,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service | 配置项 | 值 | |-------|-----| | **镜像类型** | 容器镜像服务企业版实例 | -| **镜像仓库** | `registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service` | +| **镜像仓库** | `registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service` | | **镜像版本** | `v1.0.0` | | **镜像拉取策略** | 总是拉取镜像 | @@ -1085,7 +1085,7 @@ docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service ```bash # ⚠️ 必须配置(否则容器启动失败) -BACKEND_SERVICE_HOST=172.16.0.30 +BACKEND_SERVICE_HOST=172.17.x.x # 可选配置(默认 3001) BACKEND_SERVICE_PORT=3001 @@ -1108,7 +1108,7 @@ upstream backend { server ${BACKEND_SERVICE_URL}; # ❌ 无法解析 http://172.16.0.30:3001 # 拆分后: -server 172.16.0.30:3001; # ✅ 正确 +server 172.17.x.x:3001; # ✅ 正确 ``` ### 步骤 5:配置健康检查 @@ -1337,7 +1337,7 @@ API 代理:响应时间 50-500ms(取决于后端) # ✅ 正常启动 ============================================ Starting Frontend Nginx Service -Backend Service: 172.16.0.30:3001 +Backend Service: 172.17.x.x:3001 ============================================ nginx: configuration file /etc/nginx/nginx.conf test is successful @@ -1354,7 +1354,7 @@ nginx: configuration file /etc/nginx/nginx.conf test is successful # ❌ 错误日志(后端连接失败) 2025/12/13 10:30:04 [error] 7#7: *1 connect() failed (111: Connection refused) while connecting to upstream client: 172.31.0.10, server: _, request: "GET /api/v1/projects HTTP/1.1" -upstream: "http://172.16.0.30:3001/api/v1/projects" +upstream: "http://172.17.x.x:3001/api/v1/projects" ``` #### 3. Nginx 状态监控 @@ -1419,7 +1419,7 @@ curl http://localhost/nginx_status cd frontend npm run build docker build -t frontend-service:v1.0.1 . -docker push registry.cn-hangzhou.aliyuncs.com/clinical-research/frontend-service:v1.0.1 +docker push registry.cn-beijing.aliyuncs.com/clinical-research/frontend-service:v1.0.1 # 2. 
在 SAE 中更新镜像 # SAE 控制台 → 应用详情 → 部署 @@ -1556,7 +1556,7 @@ location / { # 2. 测试后端内网地址是否可达 # 登录前端应用的 Webshell: -curl http://172.16.0.30:3001/api/v1/health +curl http://172.17.x.x:3001/api/v1/health # 如果返回错误,说明: # - 后端服务未启动 @@ -1567,7 +1567,7 @@ curl http://172.16.0.30:3001/api/v1/health cat /etc/nginx/nginx.conf | grep -A 5 "upstream backend" # 应该看到正确的后端地址: -# server 172.16.0.30:3001 fail_timeout=30s max_fails=3; +# server 172.17.x.x:3001 fail_timeout=30s max_fails=3; # 4. 查看 Nginx 错误日志 tail -f /var/log/nginx/error.log | grep "upstream" @@ -1579,14 +1579,14 @@ tail -f /var/log/nginx/error.log | grep "upstream" # 方法 1:更新环境变量 # SAE 控制台 → frontend-service → 应用配置 → 环境变量 # 确认: -BACKEND_SERVICE_HOST=172.16.0.30 # 正确的内网 IP +BACKEND_SERVICE_HOST=172.17.x.x # 正确的内网 IP BACKEND_SERVICE_PORT=3001 # 重启应用使环境变量生效 # 方法 2:测试内网连通性 # 在前端 Webshell 中: -telnet 172.16.0.30 3001 +telnet 172.17.x.x 3001 # 如果连接失败,检查: # - 后端和前端是否在同一 VPC # - 安全组规则是否允许访问 @@ -1744,7 +1744,7 @@ cat /docker-entrypoint.sh | grep "envsubst" cat /etc/nginx/nginx.conf | grep "server.*backend" # 应该看到: -# server 172.16.0.30:3001; +# server 172.17.x.x:3001; # 如果看到: # server ${BACKEND_SERVICE_HOST}:${BACKEND_SERVICE_PORT}; # ❌ 未替换 @@ -1809,11 +1809,11 @@ export default defineConfig({ ```bash # ✅ 正确做法:拆分 Host 和 Port -BACKEND_SERVICE_HOST=172.16.0.30 +BACKEND_SERVICE_HOST=172.17.x.x BACKEND_SERVICE_PORT=3001 # ❌ 错误做法:完整 URL -BACKEND_SERVICE_URL=http://172.16.0.30:3001 +BACKEND_SERVICE_URL=http://172.17.x.x:3001 # Nginx 无法解析协议前缀 ``` diff --git a/docs/05-部署文档/文档修正报告-20251214.md b/docs/05-部署文档/文档修正报告-20251214.md index 16146a0a..00346543 100644 --- a/docs/05-部署文档/文档修正报告-20251214.md +++ b/docs/05-部署文档/文档修正报告-20251214.md @@ -465,3 +465,9 @@ NAT网关成本¥100/月,对初创团队是一笔开销 **审查依据:** 专业技术团队反馈 **修正质量:** ⭐⭐⭐⭐⭐(8/8问题已全部修正) + + + + + + diff --git a/docs/07-运维文档/03-SAE环境变量配置指南.md b/docs/07-运维文档/03-SAE环境变量配置指南.md index 74401f27..51c8ca08 100644 --- a/docs/07-运维文档/03-SAE环境变量配置指南.md +++ b/docs/07-运维文档/03-SAE环境变量配置指南.md @@ -370,3 +370,9 @@ curl 
http://你的SAE地址:3001/health + + + + + + diff --git a/docs/07-运维文档/05-Redis缓存与队列的区别说明.md b/docs/07-运维文档/05-Redis缓存与队列的区别说明.md index 98fc1583..cdbbc54d 100644 --- a/docs/07-运维文档/05-Redis缓存与队列的区别说明.md +++ b/docs/07-运维文档/05-Redis缓存与队列的区别说明.md @@ -702,3 +702,9 @@ const job = await queue.getJob(jobId); + + + + + + diff --git a/docs/07-运维文档/06-长时间任务可靠性分析.md b/docs/07-运维文档/06-长时间任务可靠性分析.md index 960d7bde..571893cb 100644 --- a/docs/07-运维文档/06-长时间任务可靠性分析.md +++ b/docs/07-运维文档/06-长时间任务可靠性分析.md @@ -469,3 +469,9 @@ processLiteraturesInBackground(task.id, projectId, testLiteratures); + + + + + + diff --git a/docs/07-运维文档/07-Redis使用需求分析(按模块).md b/docs/07-运维文档/07-Redis使用需求分析(按模块).md index 8b526f72..115e3fb5 100644 --- a/docs/07-运维文档/07-Redis使用需求分析(按模块).md +++ b/docs/07-运维文档/07-Redis使用需求分析(按模块).md @@ -946,3 +946,9 @@ ROI = (¥22,556 - ¥144) / ¥144 × 100% = 15,564% + + + + + + diff --git a/docs/08-项目管理/03-每周计划/2025-12-13-Postgres-Only架构改造完成.md b/docs/08-项目管理/03-每周计划/2025-12-13-Postgres-Only架构改造完成.md index 9c1f7737..1acd0eab 100644 --- a/docs/08-项目管理/03-每周计划/2025-12-13-Postgres-Only架构改造完成.md +++ b/docs/08-项目管理/03-每周计划/2025-12-13-Postgres-Only架构改造完成.md @@ -1003,3 +1003,9 @@ Redis 实例:¥500/月 **下次更新:** Phase 8 完成后 + + + + + + diff --git a/docs/08-项目管理/05-技术债务/通用对话服务抽取计划.md b/docs/08-项目管理/05-技术债务/通用对话服务抽取计划.md index 64c91204..5a670b5d 100644 --- a/docs/08-项目管理/05-技术债务/通用对话服务抽取计划.md +++ b/docs/08-项目管理/05-技术债务/通用对话服务抽取计划.md @@ -461,3 +461,9 @@ import { ChatContainer } from '@/shared/components/Chat'; + + + + + + diff --git a/extraction_service/main.py b/extraction_service/main.py index 417aba08..6076c5f7 100644 --- a/extraction_service/main.py +++ b/extraction_service/main.py @@ -70,6 +70,17 @@ from operations.conditional import apply_conditional_column, apply_simple_binnin from operations.dropna import drop_missing_values, get_missing_summary from operations.compute import compute_column, get_formula_examples from operations.pivot import pivot_long_to_wide, get_pivot_preview +from 
operations.unpivot import apply_unpivot, get_unpivot_preview # ✨ 新增:宽表转长表 +from operations.metric_time_transform import ( + apply_metric_time_transform, + detect_common_pattern, + preview_metric_time_transform, + detect_metric_groups, # ✨ 多指标自动分组 + apply_multi_metric_to_long, # ✨ 多指标转长表(方向1) + preview_multi_metric_to_long, # ✨ 多指标转换预览(方向1) + apply_multi_metric_to_matrix, # ✨ 多指标转矩阵(方向2) + preview_multi_metric_to_matrix # ✨ 多指标转换预览(方向2) +) from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats @@ -149,6 +160,59 @@ class PivotRequest(BaseModel): pivot_value_order: List[str] = [] # ✨ 新增:透视列值的原始顺序 +class UnpivotRequest(BaseModel): + """Unpivot请求模型(宽表转长表)""" + data: List[Dict[str, Any]] + id_vars: List[str] # ID列(保持不变的列) + value_vars: List[str] # 值列(需要转换的列) + var_name: str = '变量' # 变量名列名 + value_name: str = '值' # 值列名 + parse_column_names: bool = False # 是否解析列名 + separator: str = '_' # 分隔符 + metric_name: Optional[str] = None # 指标列名 + time_name: Optional[str] = None # 时间列名 + dropna: bool = False # 是否删除缺失值行 + + +class MetricTimeTransformRequest(BaseModel): + """指标-时间表转换请求模型""" + data: List[Dict[str, Any]] + id_vars: List[str] # ID列(保持不变的列) + value_vars: List[str] # 值列(同一指标的多个时间点) + metric_name: Optional[str] = None # 指标名称(如果为None,则自动检测) + separator: Optional[str] = None # 分隔符(如果为None,则自动检测) + timepoint_col_name: str = '时间点' # 时间点列名 + + +class MetricTimeDetectRequest(BaseModel): + """指标-时间表模式检测请求模型""" + value_vars: List[str] # 值列(用于检测模式) + + +class MultiMetricDetectRequest(BaseModel): + """多指标分组检测请求模型""" + value_vars: List[str] # 值列(用于检测分组) + separators: Optional[List[str]] = None # 可选的分隔符列表 + + +class MultiMetricToLongRequest(BaseModel): + """多指标转长表请求模型(方向1)""" + data: List[Dict[str, Any]] + id_vars: List[str] # ID列 + value_vars: List[str] # 值列(多个指标的多个时间点) + separators: Optional[List[str]] = None # 可选的分隔符列表 + event_col_name: str = 'Event_Name' # 时间点列名 + + +class MultiMetricToMatrixRequest(BaseModel): + """多指标转矩阵请求模型(方向2)""" + data: 
List[Dict[str, Any]] + id_vars: List[str] # ID列 + value_vars: List[str] # 值列(多个指标的多个时间点) + separators: Optional[List[str]] = None # 可选的分隔符列表 + metric_col_name: str = '指标名' # 指标列名 + + class FillnaStatsRequest(BaseModel): """获取列缺失值统计请求模型""" data: List[Dict[str, Any]] @@ -1292,6 +1356,515 @@ async def operation_pivot(request: PivotRequest): }, status_code=400) +@app.post("/api/operations/unpivot") +async def operation_unpivot(request: UnpivotRequest): + """ + Unpivot操作:宽表转长表(预写函数) + + 将横向数据转为纵向重复数据 + + 典型医学场景: + - 多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列) + - 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列) + + Args: + request: UnpivotRequest + - data: 数据 + - id_vars: ID列(保持不变的列) + - value_vars: 值列(需要转换的列) + - var_name: 变量名列名(默认:"变量") + - value_name: 值列名(默认:"值") + - parse_column_names: 是否解析列名(默认:False) + - separator: 分隔符(默认:"_") + - metric_name: 指标列名(可选) + - time_name: 时间列名(可选) + - dropna: 是否删除缺失值行(默认:False) + + Returns: + { + "success": bool, + "result_data": List[Dict], + "output": str, + "execution_time": float, + "result_shape": [rows, cols] + } + """ + try: + import pandas as pd + import numpy as np + import time + import io + import sys + + start_time = time.time() + + # 捕获打印输出 + captured_output = io.StringIO() + sys.stdout = captured_output + + try: + # 转换为DataFrame + df = pd.DataFrame(request.data) + + # ✨ 调用预写函数 + result_df = apply_unpivot( + df, + request.id_vars, + request.value_vars, + request.var_name, + request.value_name, + request.parse_column_names, + request.separator, + request.metric_name, + request.time_name, + request.dropna + ) + + # 转换回JSON(处理NaN和inf值) + result_df = result_df.replace([np.inf, -np.inf], None) + result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None}) + result_data = result_df_clean.to_dict('records') + + # 恢复stdout + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + + execution_time = time.time() - start_time + + logger.info(f"Unpivot成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_data)} 行") 
+ + return JSONResponse(content={ + "success": True, + "result_data": result_data, + "output": output, + "execution_time": execution_time, + "result_shape": [len(result_data), len(result_df.columns)] + }) + + except Exception as e: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + raise e + + except Exception as e: + logger.error(f"Unpivot操作失败: {str(e)}") + return JSONResponse(content={ + "success": False, + "error": str(e), + "execution_time": time.time() - start_time if 'start_time' in locals() else 0 + }, status_code=400) + + +@app.post("/api/operations/metric-time/detect") +async def operation_metric_time_detect(request: MetricTimeDetectRequest): + """ + 检测指标-时间表转换模式 + + 自动分析列名,检测: + - 公共前缀(指标名) + - 分隔符 + - 时间点列表 + - 置信度 + + Args: + request: MetricTimeDetectRequest + - value_vars: 值列列表 + + Returns: + { + "success": bool, + "pattern": { + "common_prefix": str, + "separator": str, + "timepoints": List[str], + "confidence": float, + "message": str + } + } + """ + try: + import time + + start_time = time.time() + + logger.info(f"检测指标-时间表模式: {len(request.value_vars)} 列") + + # 调用检测函数 + pattern = detect_common_pattern(request.value_vars) + + execution_time = time.time() - start_time + + logger.info(f"模式检测完成: confidence={pattern.get('confidence', 0):.2f}") + + return JSONResponse(content={ + "success": pattern['success'], + "pattern": pattern, + "execution_time": execution_time + }) + + except Exception as e: + logger.error(f"模式检测失败: {str(e)}") + return JSONResponse(content={ + "success": False, + "error": str(e), + "execution_time": time.time() - start_time if 'start_time' in locals() else 0 + }, status_code=400) + + +@app.post("/api/operations/metric-time") +async def operation_metric_time_transform(request: MetricTimeTransformRequest): + """ + 指标-时间表转换操作(预写函数) + + 将多个时间点列转换为"指标行+时间点列"格式 + + 典型场景: + - 制作临床研究Table 1 + - 横向对比同一指标的时间变化 + + Args: + request: MetricTimeTransformRequest + - data: 数据 + - id_vars: ID列(保持不变) + - value_vars: 值列(同一指标的多个时间点) 
+ - metric_name: 指标名称(可选,自动检测) + - separator: 分隔符(可选,自动检测) + - timepoint_col_name: 时间点列名 + + Returns: + { + "success": bool, + "result_data": List[Dict], + "output": str, + "execution_time": float, + "result_shape": [rows, cols] + } + """ + try: + import pandas as pd + import numpy as np + import time + import io + import sys + + start_time = time.time() + + # 捕获打印输出 + captured_output = io.StringIO() + sys.stdout = captured_output + + try: + # 转换为DataFrame + df = pd.DataFrame(request.data) + + # ✨ 调用预写函数 + result_df = apply_metric_time_transform( + df, + request.id_vars, + request.value_vars, + request.metric_name, + request.separator, + request.timepoint_col_name + ) + + # 转换回JSON(处理NaN和inf值) + result_df = result_df.replace([np.inf, -np.inf], None) + result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None}) + result_data = result_df_clean.to_dict('records') + + # 恢复stdout + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + + execution_time = time.time() - start_time + + logger.info(f"指标-时间表转换成功: {len(request.id_vars)} ID列 × {len(request.value_vars)} 值列 → {len(result_df.columns)} 列") + + return JSONResponse(content={ + "success": True, + "result_data": result_data, + "output": output, + "execution_time": execution_time, + "result_shape": [len(result_data), len(result_df.columns)] + }) + + except Exception as e: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + raise e + + except Exception as e: + logger.error(f"指标-时间表转换失败: {str(e)}") + return JSONResponse(content={ + "success": False, + "error": str(e), + "execution_time": time.time() - start_time if 'start_time' in locals() else 0 + }, status_code=400) + + +# ==================== 多指标转换API ==================== + +@app.post("/api/operations/multi-metric/detect") +async def operation_multi_metric_detect(request: MultiMetricDetectRequest): + """ + 多指标自动分组检测 + + 检测多个指标的列并自动分组 + + Args: + request: MultiMetricDetectRequest + - value_vars: 值列列表 + - separators: 可选的分隔符列表 + 
+ Returns: + { + "success": bool, + "metric_groups": Dict[str, List[str]], # 指标分组 + "separator": str, # 检测到的分隔符 + "timepoints": List[str], # 时间点列表 + "confidence": float, # 置信度 + "message": str + } + """ + try: + result = detect_metric_groups( + request.value_vars, + request.separators + ) + + logger.info(f"多指标分组检测: {len(request.value_vars)} 列 → {len(result.get('metric_groups', {}))} 个指标") + + return JSONResponse(content=result) + + except Exception as e: + logger.error(f"多指标分组检测失败: {str(e)}") + return JSONResponse(content={ + "success": False, + "error": str(e) + }, status_code=400) + + +@app.post("/api/operations/multi-metric/to-long") +async def operation_multi_metric_to_long(request: MultiMetricToLongRequest): + """ + 多指标转长表(时间点为行,指标为列) + + 将多个指标的宽表转换为长表格式,适合统计分析和可视化 + + 典型场景: + - 纵向研究数据分析 + - 重复测量数据准备 + - 混合效应模型、GEE分析 + - 数据可视化(ggplot2、seaborn) + + Args: + request: MultiMetricToLongRequest + - data: 数据 + - id_vars: ID列 + - value_vars: 值列(多个指标的多个时间点) + - separators: 可选的分隔符列表 + - event_col_name: 时间点列名 + + Returns: + { + "success": bool, + "result_data": List[Dict], + "grouping": {...}, # 分组信息 + "output": str, + "execution_time": float, + "result_shape": [rows, cols] + } + """ + try: + import pandas as pd + import numpy as np + import time + import io + import sys + + start_time = time.time() + + # 捕获打印输出 + captured_output = io.StringIO() + sys.stdout = captured_output + + try: + # 转换为DataFrame + df = pd.DataFrame(request.data) + + # 1. 先检测分组 + grouping = detect_metric_groups( + request.value_vars, + request.separators + ) + + if not grouping['success']: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + return JSONResponse(content={ + "success": False, + "error": grouping['message'], + "output": output + }, status_code=400) + + # 2. 
执行转换 + result_df = apply_multi_metric_to_long( + df, + request.id_vars, + grouping['metric_groups'], + grouping['separator'], + request.event_col_name + ) + + # 转换回JSON(处理NaN和inf值) + result_df = result_df.replace([np.inf, -np.inf], None) + result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None}) + result_data = result_df_clean.to_dict('records') + + # 恢复stdout + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + + execution_time = time.time() - start_time + + logger.info(f"多指标转长表成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行") + + return JSONResponse(content={ + "success": True, + "result_data": result_data, + "grouping": grouping, + "output": output, + "execution_time": execution_time, + "result_shape": [len(result_data), len(result_df.columns)] + }) + + except Exception as e: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + raise e + + except Exception as e: + logger.error(f"多指标转长表失败: {str(e)}") + import traceback + traceback.print_exc() + return JSONResponse(content={ + "success": False, + "error": str(e), + "execution_time": time.time() - start_time if 'start_time' in locals() else 0 + }, status_code=400) + + +@app.post("/api/operations/multi-metric/to-matrix") +async def operation_multi_metric_to_matrix(request: MultiMetricToMatrixRequest): + """ + 多指标转矩阵(时间点为列,指标为行) + + 将多个指标的宽表转换为矩阵格式,适合临床报告和数据审查 + + 典型场景: + - 临床研究报告 + - 数据审查表 + - CRF核对 + - 单受试者数据审查 + + Args: + request: MultiMetricToMatrixRequest + - data: 数据 + - id_vars: ID列 + - value_vars: 值列(多个指标的多个时间点) + - separators: 可选的分隔符列表 + - metric_col_name: 指标列名 + + Returns: + { + "success": bool, + "result_data": List[Dict], + "grouping": {...}, # 分组信息 + "output": str, + "execution_time": float, + "result_shape": [rows, cols] + } + """ + try: + import pandas as pd + import numpy as np + import time + import io + import sys + + start_time = time.time() + + # 捕获打印输出 + captured_output = io.StringIO() + sys.stdout = 
captured_output + + try: + # 转换为DataFrame + df = pd.DataFrame(request.data) + + # 1. 先检测分组 + grouping = detect_metric_groups( + request.value_vars, + request.separators + ) + + if not grouping['success']: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + return JSONResponse(content={ + "success": False, + "error": grouping['message'], + "output": output + }, status_code=400) + + # 2. 执行转换 + result_df = apply_multi_metric_to_matrix( + df, + request.id_vars, + grouping['metric_groups'], + grouping['separator'], + 'Event_Name', + request.metric_col_name + ) + + # 转换回JSON(处理NaN和inf值) + result_df = result_df.replace([np.inf, -np.inf], None) + result_df_clean = result_df.fillna(value=pd.NA).replace({pd.NA: None}) + result_data = result_df_clean.to_dict('records') + + # 恢复stdout + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + + execution_time = time.time() - start_time + + logger.info(f"多指标转矩阵成功: {len(grouping['metric_groups'])} 指标 × {len(grouping['timepoints'])} 时间点 → {len(result_df)} 行") + + return JSONResponse(content={ + "success": True, + "result_data": result_data, + "grouping": grouping, + "output": output, + "execution_time": execution_time, + "result_shape": [len(result_data), len(result_df.columns)] + }) + + except Exception as e: + sys.stdout = sys.__stdout__ + output = captured_output.getvalue() + raise e + + except Exception as e: + logger.error(f"多指标转矩阵失败: {str(e)}") + import traceback + traceback.print_exc() + return JSONResponse(content={ + "success": False, + "error": str(e), + "execution_time": time.time() - start_time if 'start_time' in locals() else 0 + }, status_code=400) + + @app.post("/api/operations/fillna-stats") async def operation_fillna_stats(request: FillnaStatsRequest): """ diff --git a/extraction_service/operations/__init__.py b/extraction_service/operations/__init__.py index 7abc9ccb..8205d154 100644 --- a/extraction_service/operations/__init__.py +++ b/extraction_service/operations/__init__.py @@ 
-24,3 +24,9 @@ __version__ = '1.0.0' + + + + + + diff --git a/extraction_service/operations/dropna.py b/extraction_service/operations/dropna.py index 2a1f404c..1ad67449 100644 --- a/extraction_service/operations/dropna.py +++ b/extraction_service/operations/dropna.py @@ -157,3 +157,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict: + + + + + + diff --git a/extraction_service/operations/filter.py b/extraction_service/operations/filter.py index 9a401015..62f3e2f2 100644 --- a/extraction_service/operations/filter.py +++ b/extraction_service/operations/filter.py @@ -117,3 +117,9 @@ def apply_filter( + + + + + + diff --git a/extraction_service/operations/metric_time_transform.py b/extraction_service/operations/metric_time_transform.py new file mode 100644 index 00000000..8ff7e04f --- /dev/null +++ b/extraction_service/operations/metric_time_transform.py @@ -0,0 +1,921 @@ +""" +指标-时间表转换(Metric-Time Transform) + +将多个时间点列转换为"指标行+时间点列"格式 +典型医学场景: +- 制作临床研究Table 1 +- 横向对比同一指标的时间变化 +- 多时间点随访数据整理 + +示例: +输入(宽表): + Record_ID | FMA___基线 | FMA___2周 | FMA___1月 + 10 | 54 | 93 | 68 + 11 | 16 | 31 | 72 + +输出(指标-时间表): + Record_ID | 时间点 | 基线 | 2周 | 1月 + 10 | FMA | 54 | 93 | 68 + 11 | FMA | 16 | 31 | 72 +""" + +import pandas as pd +import numpy as np +from typing import List, Optional, Dict, Any +import os +from collections import defaultdict + + +def detect_common_pattern(column_names: List[str]) -> Dict[str, Any]: + """ + 自动检测列名的公共模式(前缀、分隔符、时间点) + + Args: + column_names: 列名列表 + + Returns: + { + 'success': bool, + 'common_prefix': str, # 公共前缀(指标名) + 'separator': str, # 分隔符 + 'timepoints': List[str], # 时间点列表 + 'confidence': float, # 置信度 0-1 + 'message': str # 提示信息 + } + + Examples: + >>> cols = ['FMA总得分___筛选及基线', 'FMA总得分___随访(2周)', 'FMA总得分___随访(1个月)'] + >>> result = detect_common_pattern(cols) + >>> result['common_prefix'] + 'FMA总得分' + >>> result['separator'] + '___' + >>> result['timepoints'] + ['筛选及基线', '随访(2周)', '随访(1个月)'] + """ + print(f"\n🔍 开始自动检测列名模式...", flush=True) + 
print(f" 输入列数: {len(column_names)}", flush=True) + + if len(column_names) < 2: + return { + 'success': False, + 'common_prefix': '', + 'separator': '', + 'timepoints': [], + 'confidence': 0.0, + 'message': '至少需要2列才能检测模式' + } + + # 打印前3个列名作为样本 + print(f" 样本列名:", flush=True) + for i, col in enumerate(column_names[:3]): + print(f" [{i+1}] {col}", flush=True) + if len(column_names) > 3: + print(f" ... 还有 {len(column_names) - 3} 列", flush=True) + + # ==================== 1. 检测最长公共前缀 ==================== + common_prefix = os.path.commonprefix(column_names) + print(f"\n ✓ 检测到公共前缀: '{common_prefix}'", flush=True) + + if not common_prefix: + return { + 'success': False, + 'common_prefix': '', + 'separator': '', + 'timepoints': [], + 'confidence': 0.0, + 'message': '未检测到公共前缀,选中的列可能不属于同一指标' + } + + # ==================== 2. 检测分隔符 ==================== + # 尝试常见分隔符(按优先级排序) + separators = ['___', '__', '_', '-', '.', '|', ' - ', ' '] + detected_separator = None + + # 方法1:检查公共前缀是否以分隔符结尾 + for sep in separators: + if common_prefix.endswith(sep): + detected_separator = sep + common_prefix = common_prefix[:-len(sep)] # 移除尾部分隔符 + print(f" ✓ 检测到分隔符: '{sep}' (位于公共前缀末尾)", flush=True) + break + + # 方法2:如果公共前缀末尾没有分隔符,尝试从剩余部分检测 + if not detected_separator: + remainders = [col[len(common_prefix):] for col in column_names] + for sep in separators: + if all(r.startswith(sep) for r in remainders if r): + detected_separator = sep + print(f" ✓ 检测到分隔符: '{sep}' (位于剩余部分开头)", flush=True) + break + + # ✨ 方法3:智能修正 - 如果剩余部分仍包含分隔符,尝试扩展公共前缀 + if detected_separator: + remainders = [col[len(common_prefix):] for col in column_names] + + # 检查每个剩余部分,看分隔符前是否还有公共部分 + parts_before_sep = [] + for remainder in remainders: + if detected_separator in remainder: + # 找到第一个分隔符的位置 + sep_pos = remainder.find(detected_separator) + part = remainder[:sep_pos] + parts_before_sep.append(part) + else: + parts_before_sep.append('') + + # 如果所有剩余部分在分隔符前都有内容,且内容相同,则扩展公共前缀 + if parts_before_sep and all(p == parts_before_sep[0] for p 
in parts_before_sep if p): + additional_prefix = parts_before_sep[0] + if additional_prefix: + print(f" 🔄 智能修正: 扩展公共前缀 '{common_prefix}' → '{common_prefix}{additional_prefix}'", flush=True) + common_prefix = common_prefix + additional_prefix + + if not detected_separator: + print(f" ⚠️ 未检测到明确分隔符,使用空字符串", flush=True) + detected_separator = '' + + # ==================== 3. 提取时间点 ==================== + if detected_separator: + # ✨ 修复:正确移除分隔符(移除整个分隔符字符串,而不是lstrip) + timepoints = [] + for col in column_names: + remainder = col[len(common_prefix):] + # 如果剩余部分以分隔符开头,移除它 + if remainder.startswith(detected_separator): + timepoint = remainder[len(detected_separator):] + else: + timepoint = remainder + timepoints.append(timepoint.strip()) + else: + # 没有分隔符,整个剩余部分作为时间点 + timepoints = [col[len(common_prefix):].strip() for col in column_names] + + print(f" ✓ 提取到 {len(timepoints)} 个时间点:", flush=True) + for i, tp in enumerate(timepoints[:5]): + print(f" [{i+1}] {tp}", flush=True) + if len(timepoints) > 5: + print(f" ... 还有 {len(timepoints) - 5} 个", flush=True) + + # ==================== 4. 计算置信度 ==================== + confidence = 1.0 + + # 检查:时间点不能为空 + empty_count = sum(1 for tp in timepoints if not tp) + if empty_count > 0: + confidence -= 0.3 + print(f" ⚠️ 发现 {empty_count} 个空时间点,降低置信度", flush=True) + + # 检查:时间点应该各不相同 + unique_timepoints = len(set(timepoints)) + if unique_timepoints < len(timepoints): + confidence -= 0.2 + print(f" ⚠️ 时间点有重复,降低置信度", flush=True) + + # 检查:公共前缀不应该太短 + if len(common_prefix) < 2: + confidence -= 0.2 + print(f" ⚠️ 公共前缀过短,降低置信度", flush=True) + + confidence = max(0.0, min(1.0, confidence)) + + print(f"\n 📊 检测置信度: {confidence:.0%}", flush=True) + + # ==================== 5. 
生成消息 ==================== + if confidence >= 0.8: + message = f"成功检测:指标='{common_prefix}', 分隔符='{detected_separator}', {len(timepoints)}个时间点" + elif confidence >= 0.5: + message = f"检测成功但有警告,建议检查结果" + else: + message = f"检测置信度较低,建议手动指定参数" + + return { + 'success': True, + 'common_prefix': common_prefix, + 'separator': detected_separator, + 'timepoints': timepoints, + 'confidence': confidence, + 'message': message + } + + +def apply_metric_time_transform( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + metric_name: Optional[str] = None, + separator: Optional[str] = None, + timepoint_col_name: str = '时间点' +) -> pd.DataFrame: + """ + 应用指标-时间表转换 + + Args: + df: 输入数据框 + id_vars: ID列(保持不变的列) + value_vars: 值列(同一指标的多个时间点) + metric_name: 指标名称(如果为None,则自动检测) + separator: 分隔符(如果为None,则自动检测) + timepoint_col_name: 时间点列的列名(默认:"时间点") + + Returns: + 转换后的数据框 + + Examples: + >>> df = pd.DataFrame({ + ... 'Record_ID': [10, 11], + ... 'FMA___基线': [54, 16], + ... 'FMA___2周': [93, 31], + ... 'FMA___1月': [68, 72] + ... }) + >>> result = apply_metric_time_transform( + ... df, + ... id_vars=['Record_ID'], + ... value_vars=['FMA___基线', 'FMA___2周', 'FMA___1月'] + ... 
) + >>> result.columns.tolist() + ['Record_ID', '时间点', '基线', '2周', '1月'] + """ + print("\n" + "="*60, flush=True) + print("🔄 开始指标-时间表转换...", flush=True) + print("="*60, flush=True) + + # ==================== 参数验证 ==================== + if df.empty: + print("⚠️ 输入数据框为空", flush=True) + return df + + if not id_vars: + raise ValueError('❌ 至少需要选择1个ID列') + + if len(value_vars) < 2: + raise ValueError('❌ 至少需要选择2个值列') + + # 验证列是否存在 + for col in id_vars + value_vars: + if col not in df.columns: + raise KeyError(f"❌ 列 '{col}' 不存在") + + print(f"\n📊 转换前数据概况:", flush=True) + print(f" - 总行数: {len(df)}", flush=True) + print(f" - ID列: {len(id_vars)} 个 ({', '.join(id_vars)})", flush=True) + print(f" - 值列: {len(value_vars)} 个", flush=True) + + # ==================== 自动检测或使用指定参数 ==================== + if not metric_name or separator is None: + print(f"\n🔍 自动检测模式...", flush=True) + pattern = detect_common_pattern(value_vars) + + if not pattern['success']: + raise ValueError(f"❌ 自动检测失败: {pattern['message']}") + + metric_name = metric_name or pattern['common_prefix'] + separator = separator if separator is not None else pattern['separator'] + timepoints = pattern['timepoints'] + + print(f"\n✅ 使用检测结果:", flush=True) + print(f" - 指标名: '{metric_name}'", flush=True) + print(f" - 分隔符: '{separator}'", flush=True) + print(f" - 置信度: {pattern['confidence']:.0%}", flush=True) + else: + print(f"\n✅ 使用手动指定参数:", flush=True) + print(f" - 指标名: '{metric_name}'", flush=True) + print(f" - 分隔符: '{separator}'", flush=True) + + # 手动拆分时间点 + timepoints = [] + for col in value_vars: + if separator and separator in col: + # 移除指标名和分隔符 + remainder = col.replace(metric_name, '', 1).lstrip(separator) + timepoints.append(remainder) + else: + # 直接移除指标名 + remainder = col.replace(metric_name, '', 1) + timepoints.append(remainder.strip()) + + # ==================== 构建结果DataFrame ==================== + print(f"\n🔨 开始构建结果数据...", flush=True) + + result_rows = [] + + for idx, row in df.iterrows(): + result_row = {} + + # 1. 
复制ID列 + for id_col in id_vars: + result_row[id_col] = row[id_col] + + # 2. 添加时间点列(实际存储的是指标名) + result_row[timepoint_col_name] = metric_name + + # 3. 添加各个时间点的值作为独立列 + for original_col, timepoint in zip(value_vars, timepoints): + result_row[timepoint] = row[original_col] + + result_rows.append(result_row) + + result_df = pd.DataFrame(result_rows) + + # ==================== 调整列顺序 ==================== + # 顺序:ID列 + 时间点列 + 各时间点列 + column_order = id_vars + [timepoint_col_name] + timepoints + result_df = result_df[column_order] + + # ==================== 统计输出 ==================== + print(f"\n{'='*60}", flush=True) + print(f"✅ 指标-时间表转换完成!", flush=True) + print(f"{'='*60}", flush=True) + print(f"📊 转换结果:", flush=True) + print(f" - 总行数: {len(result_df)} (不变)", flush=True) + print(f" - 总列数: {len(result_df.columns)} (ID列 + 时间点列 + {len(timepoints)}个时间点列)", flush=True) + print(f" - 指标名: {metric_name}", flush=True) + print(f" - 时间点: {', '.join(timepoints[:5])}{'...' if len(timepoints) > 5 else ''}", flush=True) + + # 显示前3行示例 + print(f"\n 前3行数据示例:", flush=True) + for idx, row in result_df.head(3).iterrows(): + row_preview = ' | '.join([f"{col}={row[col]}" for col in result_df.columns[:4]]) + print(f" [{idx}] {row_preview}...", flush=True) + + return result_df + + +def preview_metric_time_transform( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + preview_rows: int = 5 +) -> Dict[str, Any]: + """ + 预览指标-时间表转换结果(不实际执行完整转换) + + Args: + df: 输入数据框 + id_vars: ID列 + value_vars: 值列 + preview_rows: 预览行数 + + Returns: + { + 'pattern': { + 'common_prefix': str, + 'separator': str, + 'timepoints': List[str], + 'confidence': float + }, + 'original_shape': (rows, cols), + 'new_shape': (rows, cols), + 'preview_data': List[Dict], + 'estimated_change': str + } + """ + # 检测模式 + pattern = detect_common_pattern(value_vars) + + if not pattern['success']: + return { + 'success': False, + 'error': pattern['message'] + } + + # 对前几行执行转换 + preview_df = df.head(preview_rows) + + try: + 
result_preview = apply_metric_time_transform( + preview_df, + id_vars, + value_vars, + pattern['common_prefix'], + pattern['separator'] + ) + + return { + 'success': True, + 'pattern': pattern, + 'original_shape': (len(df), len(df.columns)), + 'new_shape': (len(df), len(id_vars) + 1 + len(pattern['timepoints'])), + 'preview_data': result_preview.to_dict('records'), + 'estimated_change': f"列数: {len(df.columns)} → {len(id_vars) + 1 + len(pattern['timepoints'])} (ID列 + 时间点列 + {len(pattern['timepoints'])}个时间点列)" + } + except Exception as e: + return { + 'success': False, + 'error': str(e) + } + + +# ==================== 多指标转换(方向1:时间点为行,指标为列)==================== + +def detect_metric_groups( + column_names: List[str], + separators: Optional[List[str]] = None +) -> Dict[str, Any]: + """ + 自动检测并分组多个指标的列 + + 参数: + column_names: 列名列表,例如 ['FMA总得分_基线', 'FMA总得分_随访1', 'ADL总分_基线', 'ADL总分_随访1'] + separators: 可选的分隔符列表,默认 ['___', '__', '_', '-', '.'] + + 返回: + { + 'success': bool, + 'metric_groups': { + 'FMA总得分': ['FMA总得分_基线', 'FMA总得分_随访1', ...], + 'ADL总分': ['ADL总分_基线', 'ADL总分_随访1', ...], + ... + }, + 'separator': str, # 检测到的分隔符 + 'timepoints': ['基线', '随访1', ...], # 所有时间点(应该每个指标都一致) + 'confidence': float, # 置信度 0.0-1.0 + 'message': str + } + """ + print(f"\n🔍 开始自动检测多指标分组...", flush=True) + print(f" 输入列数: {len(column_names)}", flush=True) + + if len(column_names) < 2: + return { + 'success': False, + 'metric_groups': {}, + 'separator': '', + 'timepoints': [], + 'confidence': 0.0, + 'message': '至少需要2列才能检测分组' + } + + if separators is None: + separators = ['___', '__', '_', '-', '.', '|', ' - ', ' '] + + # ==================== 1. 
尝试每个分隔符 ==================== + detected_separator = None + metric_groups = defaultdict(list) + + for sep in separators: + temp_groups = defaultdict(list) + failed = False + + for col in column_names: + if sep not in col: + failed = True + break + + # 分割列名 + parts = col.split(sep) + if len(parts) < 2: + failed = True + break + + # 第一部分作为指标名 + metric_name = parts[0] + temp_groups[metric_name].append(col) + + if not failed and len(temp_groups) > 0: + detected_separator = sep + metric_groups = temp_groups + print(f" ✓ 检测到分隔符: '{sep}'", flush=True) + break + + if not detected_separator: + return { + 'success': False, + 'metric_groups': {}, + 'separator': '', + 'timepoints': [], + 'confidence': 0.0, + 'message': '未检测到公共分隔符,请确认选中的列格式一致' + } + + # ==================== 2. 提取每个指标的时间点 ==================== + metric_timepoints = {} + + for metric_name, cols in metric_groups.items(): + timepoints = [] + for col in cols: + # 提取时间点(分隔符后的部分) + parts = col.split(detected_separator) + if len(parts) >= 2: + # 使用最后一部分作为时间点(支持多级分隔,如 "FMA总得分_子项_基线") + timepoint = parts[-1].strip() + timepoints.append(timepoint) + + metric_timepoints[metric_name] = timepoints + + print(f" ✓ 检测到 {len(metric_groups)} 个指标:", flush=True) + for metric_name, cols in metric_groups.items(): + print(f" • {metric_name} ({len(cols)}列)", flush=True) + + # ==================== 3. 
验证时间点一致性 ==================== + # 检查所有指标的时间点是否相同 + all_timepoints = list(metric_timepoints.values()) + first_timepoints = all_timepoints[0] + + consistent = True + for tp_list in all_timepoints[1:]: + if tp_list != first_timepoints: + consistent = False + break + + if not consistent: + print(f" ⚠️ 警告: 各指标的时间点不完全一致", flush=True) + # 使用所有时间点的并集 + all_unique_timepoints = sorted(set(tp for tp_list in all_timepoints for tp in tp_list)) + confidence = 0.6 + message = f"检测到{len(metric_groups)}个指标,但时间点不完全一致。将使用所有时间点的并集,缺失值将填充为NA。" + else: + all_unique_timepoints = first_timepoints + confidence = 1.0 + message = f"成功检测到{len(metric_groups)}个指标,共{len(all_unique_timepoints)}个时间点" + + print(f" ✓ 检测到 {len(all_unique_timepoints)} 个时间点:", flush=True) + for i, tp in enumerate(all_unique_timepoints[:5]): + print(f" [{i+1}] {tp}", flush=True) + if len(all_unique_timepoints) > 5: + print(f" ... 还有 {len(all_unique_timepoints) - 5} 个", flush=True) + + # ==================== 4. 计算置信度 ==================== + # 检查:每个指标的列数是否相同 + column_counts = [len(cols) for cols in metric_groups.values()] + if len(set(column_counts)) > 1: + confidence -= 0.2 + print(f" ⚠️ 各指标的列数不同,降低置信度", flush=True) + + return { + 'success': True, + 'metric_groups': dict(metric_groups), + 'separator': detected_separator, + 'timepoints': all_unique_timepoints, + 'confidence': confidence, + 'message': message + } + + +def apply_multi_metric_to_long( + df: pd.DataFrame, + id_vars: List[str], + metric_groups: Dict[str, List[str]], + separator: str, + event_col_name: str = 'Event_Name' +) -> pd.DataFrame: + """ + 多指标转长表:时间点为行,指标为列 + + 参数: + df: 原始数据框 + id_vars: ID列列表 + metric_groups: 指标分组字典,格式 {'FMA总得分': ['FMA总得分_基线', ...], ...} + separator: 分隔符 + event_col_name: 时间点列的列名 + + 返回: + 转换后的数据框 + + 示例: + 输入: + Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1 + 10 | 58 | 67 | 值1 | 值2 + + 输出: + Record_ID | Event_Name | FMA总得分 | ADL总分 + 10 | 基线 | 58 | 值1 + 10 | 随访1 | 67 | 值2 + """ + print(f"\n🔄 开始多指标转长表转换...", flush=True) + 
print(f" 原始形状: {df.shape}", flush=True) + print(f" ID列: {id_vars}", flush=True) + print(f" 指标数: {len(metric_groups)}", flush=True) + + # ✨ 记录原始行的顺序(保持原始Record ID顺序) + df = df.copy() + df['_original_order'] = range(len(df)) + + # ==================== 1. 对每个指标执行 melt ==================== + melted_dfs = [] + + for metric_name, cols in metric_groups.items(): + print(f" • 处理指标: {metric_name} ({len(cols)}列)", flush=True) + + # 提取该指标的数据(包含原始顺序列) + df_metric = df[id_vars + ['_original_order'] + cols].copy() + + # Melt(保留原始顺序列) + df_melted = df_metric.melt( + id_vars=id_vars + ['_original_order'], + value_vars=cols, + var_name='_temp_col', + value_name=metric_name + ) + + # 提取时间点(移除分隔符前的指标名部分) + df_melted[event_col_name] = df_melted['_temp_col'].apply( + lambda x: x.split(separator)[-1].strip() if separator in x else x + ) + + # 删除临时列 + df_melted = df_melted.drop('_temp_col', axis=1) + + melted_dfs.append(df_melted) + + # ==================== 2. Merge所有指标 ==================== + print(f" • 合并 {len(melted_dfs)} 个指标的数据...", flush=True) + + result = melted_dfs[0] + for i, df_metric in enumerate(melted_dfs[1:], 1): + result = result.merge( + df_metric, + on=id_vars + ['_original_order', event_col_name], + how='outer' # 外连接,保留所有时间点 + ) + + # ==================== 3. 排序 ==================== + # ✨ 按原始顺序和时间点排序(保持原始Record ID顺序) + result = result.sort_values(by=['_original_order', event_col_name]).reset_index(drop=True) + + # 删除临时的原始顺序列 + result = result.drop('_original_order', axis=1) + + # ==================== 4. 
调整列顺序 ==================== + # 确保列顺序为:ID列 → Event_Name → 所有指标列 + metric_cols = [col for col in result.columns if col not in id_vars and col != event_col_name] + desired_column_order = id_vars + [event_col_name] + metric_cols + result = result[desired_column_order] + + print(f" ✓ 转换完成!新形状: {result.shape}", flush=True) + print(f" ✓ 列顺序: {list(result.columns)}", flush=True) + + return result + + +def preview_multi_metric_to_long( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + separators: Optional[List[str]] = None, + event_col_name: str = 'Event_Name', + preview_rows: int = 10 +) -> Dict[str, Any]: + """ + 预览多指标转长表的结果 + + 返回: + { + 'success': bool, + 'grouping': {...}, # detect_metric_groups的结果 + 'original_shape': (rows, cols), + 'new_shape': (rows, cols), + 'preview_data': [...], + 'estimated_change': str + } + """ + print(f"\n📊 预览多指标转长表...", flush=True) + + # 1. 检测分组 + grouping = detect_metric_groups(value_vars, separators) + + if not grouping['success']: + return { + 'success': False, + 'error': grouping['message'] + } + + # 2. 
对前几行执行转换 + preview_df = df.head(preview_rows) + + try: + result_preview = apply_multi_metric_to_long( + preview_df, + id_vars, + grouping['metric_groups'], + grouping['separator'], + event_col_name + ) + + num_metrics = len(grouping['metric_groups']) + num_timepoints = len(grouping['timepoints']) + + return { + 'success': True, + 'grouping': grouping, + 'original_shape': (len(df), len(df.columns)), + 'new_shape': (len(df) * num_timepoints, len(id_vars) + 1 + num_metrics), + 'preview_data': result_preview.to_dict('records'), + 'estimated_change': f"行数: {len(df)} → {len(df) * num_timepoints} (每个ID复制{num_timepoints}次); 列数: {len(df.columns)} → {len(id_vars) + 1 + num_metrics} (ID列 + 时间点列 + {num_metrics}个指标列)" + } + except Exception as e: + import traceback + print(f" ❌ 预览失败: {str(e)}", flush=True) + traceback.print_exc() + return { + 'success': False, + 'error': str(e) + } + + +# ==================== 多指标转换(方向2:时间点为列,指标为行)==================== + +def apply_multi_metric_to_matrix( + df: pd.DataFrame, + id_vars: List[str], + metric_groups: Dict[str, List[str]], + separator: str, + event_col_name: str = 'Event_Name', + metric_col_name: str = '指标名' +) -> pd.DataFrame: + """ + 多指标转矩阵格式:时间点为列,指标为行 + + 参数: + df: 原始数据框 + id_vars: ID列列表 + metric_groups: 指标分组字典 + separator: 分隔符 + event_col_name: 时间点列的列名(中间变量) + metric_col_name: 指标列的列名 + + 返回: + 转换后的数据框 + + 示例: + 输入: + Record_ID | FMA总得分_基线 | FMA总得分_随访1 | ADL总分_基线 | ADL总分_随访1 + 10 | 58 | 67 | 值1 | 值2 + + 输出: + Record_ID | 指标名 | 基线 | 随访1 + 10 | FMA总得分 | 58 | 67 + 10 | ADL总分 | 值1 | 值2 + """ + print(f"\n🔄 开始多指标转矩阵格式...", flush=True) + print(f" 原始形状: {df.shape}", flush=True) + print(f" ID列: {id_vars}", flush=True) + print(f" 指标数: {len(metric_groups)}", flush=True) + + # ✨ 记录原始行的顺序(保持原始Record ID顺序) + # 创建ID到原始顺序的映射 + df_with_order = df.copy() + df_with_order['_original_order'] = range(len(df_with_order)) + + # 创建ID列到原始顺序的映射字典 + # 如果有多个ID列,使用元组作为key + if len(id_vars) == 1: + id_to_order = 
df_with_order.set_index(id_vars[0])['_original_order'].to_dict() + else: + id_to_order = df_with_order.set_index(id_vars)['_original_order'].to_dict() + + # ==================== 1. 先转成长表 ==================== + df_long = apply_multi_metric_to_long( + df, + id_vars, + metric_groups, + separator, + event_col_name + ) + + print(f" • 长表形状: {df_long.shape}", flush=True) + + # ==================== 2. 转成宽格式(指标为行,时间点为列)==================== + # 先melt所有指标列,变成 (ID, Event_Name, 指标名, 值) 格式 + metric_cols = [col for col in df_long.columns if col not in id_vars and col != event_col_name] + + print(f" • 准备pivot: {len(metric_cols)} 个指标列", flush=True) + + # Melt:将所有指标列转为行 + df_melted = df_long.melt( + id_vars=id_vars + [event_col_name], + value_vars=metric_cols, + var_name=metric_col_name, + value_name='_value' + ) + + print(f" • Melt后形状: {df_melted.shape}", flush=True) + + # Pivot:时间点变成列 + # 使用 pivot_table 而不是 pivot,因为可能有重复索引 + result = df_melted.pivot_table( + index=id_vars + [metric_col_name], + columns=event_col_name, + values='_value', + aggfunc='first' # 如果有重复,取第一个值 + ).reset_index() + + # 清理列名(移除多级索引的名称) + result.columns.name = None + + # ✨ 添加原始顺序列(用于排序) + if len(id_vars) == 1: + result['_original_order'] = result[id_vars[0]].map(id_to_order) + else: + # 多个ID列的情况,创建元组作为key + result['_original_order'] = result[id_vars].apply(tuple, axis=1).map(id_to_order) + + # ==================== 3. 
调整列顺序 ==================== + # 确保列顺序为:ID列 → 指标名列 → 所有时间点列(按原始顺序) + timepoint_cols = [col for col in result.columns if col not in id_vars and col != metric_col_name] + + # 尝试保持时间点的原始顺序(从 metric_groups 中获取) + first_metric_cols = list(metric_groups.values())[0] + original_timepoint_order = [] + for col in first_metric_cols: + timepoint = col.split(separator)[-1].strip() if separator in col else col + if timepoint not in original_timepoint_order: + original_timepoint_order.append(timepoint) + + # 按原始顺序排列时间点列 + sorted_timepoint_cols = [] + for tp in original_timepoint_order: + if tp in timepoint_cols: + sorted_timepoint_cols.append(tp) + # 添加任何未在原始顺序中的时间点(防御性编程) + for tp in timepoint_cols: + if tp not in sorted_timepoint_cols: + sorted_timepoint_cols.append(tp) + + # ==================== 4. 排序 ==================== + # ✨ 按原始顺序和指标名排序(保持原始Record ID顺序) + result = result.sort_values(by=['_original_order', metric_col_name]).reset_index(drop=True) + + # 删除临时的原始顺序列 + result = result.drop('_original_order', axis=1) + + # ==================== 5. 调整列顺序 ==================== + desired_column_order = id_vars + [metric_col_name] + sorted_timepoint_cols + result = result[desired_column_order] + + print(f" ✓ 转换完成!新形状: {result.shape}", flush=True) + print(f" ✓ 列顺序: {list(result.columns)}", flush=True) + + return result + + +def preview_multi_metric_to_matrix( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + separators: Optional[List[str]] = None, + metric_col_name: str = '指标名', + preview_rows: int = 10 +) -> Dict[str, Any]: + """ + 预览多指标转矩阵格式的结果 + + 返回: + { + 'success': bool, + 'grouping': {...}, # detect_metric_groups的结果 + 'original_shape': (rows, cols), + 'new_shape': (rows, cols), + 'preview_data': [...], + 'estimated_change': str + } + """ + print(f"\n📊 预览多指标转矩阵格式...", flush=True) + + # 1. 检测分组 + grouping = detect_metric_groups(value_vars, separators) + + if not grouping['success']: + return { + 'success': False, + 'error': grouping['message'] + } + + # 2. 
对前几行执行转换 + preview_df = df.head(preview_rows) + + try: + result_preview = apply_multi_metric_to_matrix( + preview_df, + id_vars, + grouping['metric_groups'], + grouping['separator'], + 'Event_Name', + metric_col_name + ) + + num_metrics = len(grouping['metric_groups']) + num_timepoints = len(grouping['timepoints']) + + # 新行数 = 原始行数 × 指标数 + estimated_new_rows = len(df) * num_metrics + # 新列数 = ID列数 + 1(指标名列)+ 时间点数 + estimated_new_cols = len(id_vars) + 1 + num_timepoints + + return { + 'success': True, + 'grouping': grouping, + 'original_shape': (len(df), len(df.columns)), + 'new_shape': (estimated_new_rows, estimated_new_cols), + 'preview_data': result_preview.to_dict('records'), + 'estimated_change': f"行数: {len(df)} → {estimated_new_rows} (每个ID复制{num_metrics}次,每个指标1行); 列数: {len(df.columns)} → {estimated_new_cols} (ID列 + 指标名列 + {num_timepoints}个时间点列)" + } + except Exception as e: + import traceback + print(f" ❌ 预览失败: {str(e)}", flush=True) + traceback.print_exc() + return { + 'success': False, + 'error': str(e) + } + diff --git a/extraction_service/operations/unpivot.py b/extraction_service/operations/unpivot.py new file mode 100644 index 00000000..ceeb7343 --- /dev/null +++ b/extraction_service/operations/unpivot.py @@ -0,0 +1,289 @@ +""" +宽表转长表(Unpivot/Melt)操作 + +提供数据重塑功能,将宽格式转换为长格式。 +典型医学场景: +- 多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列) +- 多指标合并分析(收缩压、舒张压 → 指标列 + 测量值列) +- 治疗组对比(治疗组_NRS、对照组_NRS → 组别列 + NRS列) +""" + +import pandas as pd +import numpy as np +from typing import List, Optional, Dict, Any +import sys + + +def apply_unpivot( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + var_name: str = '变量', + value_name: str = '值', + parse_column_names: bool = False, + separator: str = '_', + metric_name: Optional[str] = None, + time_name: Optional[str] = None, + dropna: bool = False +) -> pd.DataFrame: + """ + 应用宽表转长表转换 + + Args: + df: 输入数据框 + id_vars: ID列(保持不变的列) + value_vars: 值列(需要转换的列) + var_name: 变量名列名(存储原列名) + value_name: 值列名(存储实际值) + 
parse_column_names: 是否解析列名(如"FMA_基线"→"FMA"+"基线") + separator: 列名分隔符 + metric_name: 指标列名(解析列名时使用) + time_name: 时间列名(解析列名时使用) + dropna: 是否删除缺失值行 + + Returns: + 转换后的长格式数据框 + + Examples: + >>> # 场景1:多时间点随访数据 + >>> df = pd.DataFrame({ + ... '患者ID': ['P001', 'P002'], + ... '性别': ['男', '女'], + ... 'FMA_基线': [32, 28], + ... 'FMA_2周': [45, 38], + ... 'FMA_1月': [52, 44] + ... }) + >>> result = apply_unpivot( + ... df, + ... id_vars=['患者ID', '性别'], + ... value_vars=['FMA_基线', 'FMA_2周', 'FMA_1月'], + ... var_name='时间点', + ... value_name='FMA值' + ... ) + >>> len(result) # 2人 × 3个时间点 = 6行 + 6 + >>> result.columns.tolist() + ['患者ID', '性别', '时间点', 'FMA值'] + + >>> # 场景2:带列名解析 + >>> result = apply_unpivot( + ... df, + ... id_vars=['患者ID', '性别'], + ... value_vars=['FMA_基线', 'FMA_2周', 'FMA_1月'], + ... parse_column_names=True, + ... separator='_', + ... metric_name='指标', + ... time_name='时间点', + ... value_name='测量值' + ... ) + >>> result.columns.tolist() + ['患者ID', '性别', '指标', '时间点', '测量值'] + >>> result['指标'].unique().tolist() + ['FMA'] + >>> result['时间点'].unique().tolist() + ['基线', '2周', '1月'] + """ + print("\n" + "="*60, flush=True) + print("🔄 开始宽表转长表转换...", flush=True) + print("="*60, flush=True) + + # ==================== 参数验证 ==================== + + if df.empty: + print("⚠️ 输入数据框为空", flush=True) + return df + + if not id_vars: + raise ValueError('❌ 至少需要选择1个ID列(标识列)') + + if len(value_vars) < 2: + raise ValueError('❌ 至少需要选择2个值列(需要转换的列)') + + # 验证列是否存在 + missing_id_cols = [col for col in id_vars if col not in df.columns] + if missing_id_cols: + raise KeyError(f"❌ ID列不存在: {', '.join(missing_id_cols)}") + + missing_value_cols = [col for col in value_vars if col not in df.columns] + if missing_value_cols: + raise KeyError(f"❌ 值列不存在: {', '.join(missing_value_cols)}") + + # 检查ID列和值列是否有重复 + overlap = set(id_vars) & set(value_vars) + if overlap: + raise ValueError(f"❌ ID列和值列不能重复: {', '.join(overlap)}") + + print(f"\n📊 转换前数据概况:", flush=True) + print(f" - 总行数: {len(df)}", flush=True) + 
print(f" - 总列数: {len(df.columns)}", flush=True) + print(f" - ID列: {len(id_vars)} 个 ({', '.join(id_vars[:3])}{'...' if len(id_vars) > 3 else ''})", flush=True) + print(f" - 值列: {len(value_vars)} 个 ({', '.join(value_vars[:3])}{'...' if len(value_vars) > 3 else ''})", flush=True) + + # ==================== 基础转换(使用pandas.melt)==================== + + try: + result = pd.melt( + df, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name + ) + + print(f"\n✅ 基础转换完成:", flush=True) + print(f" - 转换后行数: {len(result)} (原 {len(df)} × {len(value_vars)})", flush=True) + print(f" - 转换后列数: {len(result.columns)} (ID列 + 变量名列 + 值列)", flush=True) + + except Exception as e: + print(f"❌ 转换失败: {str(e)}", flush=True) + raise + + # ==================== 高级功能:解析列名 ==================== + + if parse_column_names and separator: + print(f"\n🔍 开始解析列名(分隔符: '{separator}')...", flush=True) + + def parse_column_name(name: str): + """ + 解析列名 + + Examples: + "FMA_基线" → ("FMA", "基线") + "血压_1月" → ("血压", "1月") + "NRS_治疗组_2周" → ("NRS", "治疗组_2周") + """ + parts = name.split(separator) + if len(parts) >= 2: + metric = parts[0] + time = separator.join(parts[1:]) + return metric, time + else: + # 没有分隔符,整个作为指标名,时间点留空 + return name, '' + + try: + # 应用解析函数 + parsed = result[var_name].apply(parse_column_name) + + # 创建新列 + metric_col = metric_name or '指标' + time_col = time_name or '时间点' + + result[metric_col] = parsed.str[0] + result[time_col] = parsed.str[1] + + # 删除原变量名列(已经拆分了) + result = result.drop(columns=[var_name]) + + # 统计解析结果 + unique_metrics = result[metric_col].nunique() + unique_times = result[time_col].nunique() + + print(f"✅ 列名解析完成:", flush=True) + print(f" - {metric_col}列: {unique_metrics} 个唯一值", flush=True) + print(f" - {time_col}列: {unique_times} 个唯一值", flush=True) + + # 显示前3个解析示例 + sample_original = value_vars[:3] + print(f"\n 解析示例:", flush=True) + for orig in sample_original: + metric, time = parse_column_name(orig) + print(f" - '{orig}' → {metric_col}='{metric}', 
{time_col}='{time}'", flush=True) + + except Exception as e: + print(f"⚠️ 列名解析失败: {str(e)}", flush=True) + print(f" 已保留原变量名列: {var_name}", flush=True) + + # ==================== 删除缺失值行 ==================== + + if dropna: + original_len = len(result) + result = result.dropna(subset=[value_name]) + dropped = original_len - len(result) + + if dropped > 0: + print(f"\n🗑️ 删除缺失值行: {dropped} 行 ({dropped/original_len*100:.1f}%)", flush=True) + + # ==================== 排序 ==================== + + # 排序:按ID列排序(保持患者分组) + result = result.sort_values(id_vars).reset_index(drop=True) + + print(f"\n✅ 排序完成: 按 {', '.join(id_vars[:2])}{'...' if len(id_vars) > 2 else ''} 排序", flush=True) + + # ==================== 最终统计 ==================== + + print(f"\n{'='*60}", flush=True) + print(f"✅ 宽表转长表转换完成!", flush=True) + print(f"{'='*60}", flush=True) + print(f"📊 最终数据:", flush=True) + print(f" - 总行数: {len(result)} (扩展了 {len(result)/len(df):.1f}x)", flush=True) + print(f" - 总列数: {len(result.columns)}", flush=True) + print(f" - 列名: {', '.join(result.columns.tolist())}", flush=True) + + # 显示前3行示例 + print(f"\n 前3行数据示例:", flush=True) + for idx, row in result.head(3).iterrows(): + row_str = ' | '.join([f"{col}={row[col]}" for col in result.columns[:4]]) + print(f" [{idx}] {row_str}...", flush=True) + + return result + + +def get_unpivot_preview( + df: pd.DataFrame, + id_vars: List[str], + value_vars: List[str], + var_name: str = '变量', + value_name: str = '值', + preview_rows: int = 10 +) -> Dict[str, Any]: + """ + 获取转换预览信息(不实际执行完整转换) + + Args: + df: 输入数据框 + id_vars: ID列 + value_vars: 值列 + var_name: 变量名列名 + value_name: 值列名 + preview_rows: 预览行数 + + Returns: + { + 'original_shape': (rows, cols), + 'new_shape': (rows, cols), + 'expansion_factor': 扩展倍数, + 'preview_data': 前N行数据, + 'estimated_change': '将从 100 行 × 15 列 转换为 500 行 × 5 列' + } + """ + original_rows = len(df) + original_cols = len(df.columns) + + # 预估转换后的形状 + new_rows = original_rows * len(value_vars) + new_cols = len(id_vars) + 2 # ID列 + 变量名列 + 
值列 + + expansion_factor = len(value_vars) + + # 生成前几行预览 + preview_df = df.head(min(3, len(df))) + preview_result = pd.melt( + preview_df, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name + ) + + return { + 'original_shape': (original_rows, original_cols), + 'new_shape': (new_rows, new_cols), + 'expansion_factor': expansion_factor, + 'preview_data': preview_result.head(preview_rows).to_dict('records'), + 'estimated_change': f"将从 {original_rows} 行 × {original_cols} 列 转换为 {new_rows} 行 × {new_cols} 列" + } + + + diff --git a/extraction_service/test_dc_api.py b/extraction_service/test_dc_api.py index 35ce8ab8..9863f6fb 100644 --- a/extraction_service/test_dc_api.py +++ b/extraction_service/test_dc_api.py @@ -291,3 +291,9 @@ if __name__ == "__main__": + + + + + + diff --git a/extraction_service/test_execute_simple.py b/extraction_service/test_execute_simple.py index 08e1b287..78c54a29 100644 --- a/extraction_service/test_execute_simple.py +++ b/extraction_service/test_execute_simple.py @@ -57,3 +57,9 @@ except Exception as e: + + + + + + diff --git a/extraction_service/test_module.py b/extraction_service/test_module.py index 582175d3..18b0affb 100644 --- a/extraction_service/test_module.py +++ b/extraction_service/test_module.py @@ -37,3 +37,9 @@ except Exception as e: + + + + + + diff --git a/frontend-v2/src/modules/asl/components/FulltextDetailDrawer.tsx b/frontend-v2/src/modules/asl/components/FulltextDetailDrawer.tsx index b5cfdf03..2e467b75 100644 --- a/frontend-v2/src/modules/asl/components/FulltextDetailDrawer.tsx +++ b/frontend-v2/src/modules/asl/components/FulltextDetailDrawer.tsx @@ -519,6 +519,12 @@ export default FulltextDetailDrawer; + + + + + + diff --git a/frontend-v2/src/modules/asl/hooks/useFulltextResults.ts b/frontend-v2/src/modules/asl/hooks/useFulltextResults.ts index 8958c766..1cfb1168 100644 --- a/frontend-v2/src/modules/asl/hooks/useFulltextResults.ts +++ 
b/frontend-v2/src/modules/asl/hooks/useFulltextResults.ts @@ -118,6 +118,12 @@ export function useFulltextResults({ + + + + + + diff --git a/frontend-v2/src/modules/asl/hooks/useFulltextTask.ts b/frontend-v2/src/modules/asl/hooks/useFulltextTask.ts index 5cc5de37..c5620b6a 100644 --- a/frontend-v2/src/modules/asl/hooks/useFulltextTask.ts +++ b/frontend-v2/src/modules/asl/hooks/useFulltextTask.ts @@ -81,6 +81,12 @@ export function useFulltextTask({ + + + + + + diff --git a/frontend-v2/src/modules/asl/pages/FulltextResults.tsx b/frontend-v2/src/modules/asl/pages/FulltextResults.tsx index 3b9986f4..a0fa79c9 100644 --- a/frontend-v2/src/modules/asl/pages/FulltextResults.tsx +++ b/frontend-v2/src/modules/asl/pages/FulltextResults.tsx @@ -472,6 +472,12 @@ export default FulltextResults; + + + + + + diff --git a/frontend-v2/src/modules/dc/hooks/useAssets.ts b/frontend-v2/src/modules/dc/hooks/useAssets.ts index a92408c3..1485e327 100644 --- a/frontend-v2/src/modules/dc/hooks/useAssets.ts +++ b/frontend-v2/src/modules/dc/hooks/useAssets.ts @@ -116,6 +116,12 @@ export const useAssets = (activeTab: AssetTabType) => { + + + + + + diff --git a/frontend-v2/src/modules/dc/hooks/useRecentTasks.ts b/frontend-v2/src/modules/dc/hooks/useRecentTasks.ts index 493eb524..5051188d 100644 --- a/frontend-v2/src/modules/dc/hooks/useRecentTasks.ts +++ b/frontend-v2/src/modules/dc/hooks/useRecentTasks.ts @@ -106,6 +106,12 @@ export const useRecentTasks = () => { + + + + + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/BinningDialog_improved.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/BinningDialog_improved.tsx index c58fa94e..c8f793bd 100644 --- a/frontend-v2/src/modules/dc/pages/tool-c/components/BinningDialog_improved.tsx +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/BinningDialog_improved.tsx @@ -345,3 +345,9 @@ export default BinningDialog; + + + + + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/DropnaDialog.tsx 
b/frontend-v2/src/modules/dc/pages/tool-c/components/DropnaDialog.tsx index 0e09c7d7..5add7074 100644 --- a/frontend-v2/src/modules/dc/pages/tool-c/components/DropnaDialog.tsx +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/DropnaDialog.tsx @@ -308,3 +308,9 @@ export default DropnaDialog; + + + + + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/MetricTimePanel.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/MetricTimePanel.tsx new file mode 100644 index 00000000..32da6d80 --- /dev/null +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/MetricTimePanel.tsx @@ -0,0 +1,401 @@ +/** + * 指标-时间表转换面板 + * + * 将多个时间点列转换为"指标行+时间点列"格式 + * 典型场景:制作临床研究Table 1,横向对比同一指标的时间变化 + */ + +import React, { useState, useEffect } from 'react'; +import { Button, Alert, Checkbox, App, Input, Spin, Tag } from 'antd'; +import { ArrowLeftRight, Info, Sparkles } from 'lucide-react'; + +interface Props { + columns: Array<{ id: string; name: string }>; + sessionId: string | null; + onApply: (newData: any[]) => void; + onClose: () => void; +} + +interface DetectedPattern { + common_prefix: string; + separator: string; + timepoints: string[]; + confidence: number; + message: string; +} + +const MetricTimePanel: React.FC = ({ + columns, + sessionId, + onApply, + onClose, +}) => { + const { message } = App.useApp(); + const [idVars, setIdVars] = useState([]); + const [valueVars, setValueVars] = useState([]); + const [loading, setLoading] = useState(false); + const [detecting, setDetecting] = useState(false); + + // 检测结果 + const [pattern, setPattern] = useState(null); + const [metricName, setMetricName] = useState(''); + const [separator, setSeparator] = useState(''); + const [timepointColName, setTimepointColName] = useState('时间点'); + + // 当值列变化时,自动检测模式 + useEffect(() => { + if (valueVars.length >= 2) { + detectPattern(); + } else { + setPattern(null); + setMetricName(''); + setSeparator(''); + } + }, [valueVars]); + + // 自动检测模式 + const detectPattern = async 
() => { + setDetecting(true); + + try { + const response = await fetch('/api/v1/dc/tool-c/metric-time/detect', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + valueVars, + }), + }); + + const result = await response.json(); + + if (result.success && result.pattern) { + setPattern(result.pattern); + setMetricName(result.pattern.common_prefix || ''); + setSeparator(result.pattern.separator || ''); + + if (result.pattern.confidence >= 0.8) { + message.success(`自动检测成功!置信度: ${(result.pattern.confidence * 100).toFixed(0)}%`); + } else if (result.pattern.confidence >= 0.5) { + message.warning(`检测成功但置信度较低 (${(result.pattern.confidence * 100).toFixed(0)}%),建议检查结果`); + } else { + message.warning('检测置信度较低,建议手动调整参数'); + } + } else { + message.error(result.error || '模式检测失败'); + } + } catch (error: any) { + message.error('检测失败:' + error.message); + } finally { + setDetecting(false); + } + }; + + // 执行转换 + const handleApply = async () => { + if (!sessionId) { + message.error('Session ID不存在'); + return; + } + + // 验证 + if (idVars.length === 0) { + message.warning('请至少选择1个ID列'); + return; + } + + if (valueVars.length < 2) { + message.warning('请至少选择2个值列'); + return; + } + + // 验证:ID列和值列不能重复 + const overlap = idVars.filter(id => valueVars.includes(id)); + if (overlap.length > 0) { + message.error(`ID列和值列不能重复:${overlap.join(', ')}`); + return; + } + + if (!metricName.trim()) { + message.warning('请输入指标名称'); + return; + } + + if (!timepointColName.trim()) { + message.warning('请输入时间点列名'); + return; + } + + setLoading(true); + + try { + const response = await fetch('/api/v1/dc/tool-c/quick-action', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + action: 'metric_time', + params: { + idVars, + valueVars, + metricName, + separator: separator || undefined, + timepointColName, + }, + }), + }); + + const result = await response.json(); + + if (!result.success) { + throw 
new Error(result.error || '指标-时间表转换失败'); + } + + message.success('指标-时间表转换成功!'); + + // 更新父组件数据 + if (result.data?.newDataPreview) { + onApply(result.data.newDataPreview); + } + + // 成功后关闭 + onClose(); + } catch (error: any) { + message.error(error.message || '执行失败'); + } finally { + setLoading(false); + } + }; + + // 获取时间点预览(从pattern中) + const timepoints = pattern?.timepoints || []; + const timepointsSample = timepoints.slice(0, 5); + const hasMoreTimepoints = timepoints.length > 5; + + return ( +
+ {/* 说明 */} + +
• 将多个时间点列转换为"指标行+时间点列"格式
+
• 典型场景:制作临床研究Table 1,横向对比同一指标的时间变化
+
• 示例:FMA___基线、FMA___2周 → 时间点列(FMA)+ 基线列 + 2周列
+
+ } + type="info" + showIcon + icon={} + className="mb-4" + /> + + {/* 第1步:ID列 */} +
+ +
+ + {columns.map((col) => ( + + {col.name} + + ))} + +
+
+ 这些列的值在转换后保持不变(如:Record_ID、性别、年龄) +
+
+ + {/* 第2步:值列 */} +
+ +
+ + {columns.map((col) => ( + + {col.name} + + ))} + +
+
+ 至少选择2列。这些列应该属于同一个指标的不同时间点(已选择 {valueVars.length} 列) +
+
+ + {/* 第3步:自动检测结果 */} + {valueVars.length >= 2 && ( +
+
+ + 第3步:自动检测分析 + {detecting && } +
+ + {pattern ? ( +
+ {/* 检测结果 */} +
+
+
+ ✓ 检测成功 +
+ = 0.8 ? 'green' : pattern.confidence >= 0.5 ? 'orange' : 'red'}> + 置信度: {(pattern.confidence * 100).toFixed(0)}% + +
+
+
• 检测到 {timepoints.length} 个时间点
+
• {pattern.message}
+
+
+ + {/* 指标名称(可编辑) */} +
+ + setMetricName(e.target.value)} + maxLength={100} + suffix={ + 🖊️ 可编辑 + } + /> +
+ 自动检测到的指标名,可手动修改 +
+
+ + {/* 分隔符(可编辑) */} +
+ + setSeparator(e.target.value)} + maxLength={10} + suffix={ + + {separator ? `"${separator}"` : '无'} + + } + /> +
+ 自动检测到的分隔符({separator ? `"${separator}"` : '未检测到'}),可手动修改 +
+
+ + {/* 时间点列名 */} +
+ + setTimepointColName(e.target.value)} + maxLength={30} + /> +
+ 新增列的列名,用于存储指标名称 +
+
+ + {/* 时间点预览 */} +
+
+ 识别到的时间点({timepoints.length}个): +
+
+ {timepointsSample.map((tp, idx) => ( + + {tp} + + ))} + {hasMoreTimepoints && ( + + ... 还有 {timepoints.length - 5} 个 + + )} +
+
+
+ ) : ( +
+ 正在自动检测列名模式... +
+ )} +
+ )} + + {/* 转换结果预览 */} + {pattern && valueVars.length >= 2 && ( + +
转换结果预览:
+
+
• 新增"{timepointColName}"列,值为:{metricName}
+
• 原来的 {valueVars.length} 列 → 转换为 {timepoints.length} 个时间点列
+
• 时间点列名:{timepointsSample.join(', ')}{hasMoreTimepoints ? '...' : ''}
+
+ + } + type="success" + showIcon={false} + className="bg-green-50 border-green-200" + /> + )} + + {/* 警告 */} + +
• 请确保选中的列属于同一个指标的不同时间点
+
• 如果检测结果不准确,可以手动修改指标名称和分隔符
+ + } + type="warning" + showIcon={false} + className="bg-orange-50 border-orange-200" + /> + + {/* 底部按钮 */} +
+ + +
+ + ); +}; + +export default MetricTimePanel; + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/MultiMetricPanel.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/MultiMetricPanel.tsx new file mode 100644 index 00000000..9735bbb4 --- /dev/null +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/MultiMetricPanel.tsx @@ -0,0 +1,530 @@ +/** + * 多指标转换面板组件 + * + * 功能: + * - 方向1:时间点为行,指标为列(统计分析格式) + * - 方向2:时间点为列,指标为行(展示格式) + * - 自动检测多个指标并分组 + */ + +import React, { useState, useEffect } from 'react'; +import { Form, Select, Button, Alert, Table, Spin, Divider, Space, Card, Tag, message, Radio } from 'antd'; + +const { Option } = Select; + +interface MultiMetricPanelProps { + sessionId: string; + columns: string[]; + onConfirm: (params: any) => void; + onCancel: () => void; +} + +interface MetricGrouping { + success: boolean; + metric_groups?: Record; + separator?: string; + timepoints?: string[]; + confidence?: number; + message?: string; +} + +export const MultiMetricPanel: React.FC = ({ + sessionId, + columns, + onConfirm, + onCancel, +}) => { + const [form] = Form.useForm(); + + const [loading, setLoading] = useState(false); + const [detecting, setDetecting] = useState(false); + const [grouping, setGrouping] = useState(null); + const [previewData, setPreviewData] = useState([]); + const [previewColumns, setPreviewColumns] = useState([]); + const [direction, setDirection] = useState<'to_long' | 'to_matrix'>('to_long'); // 转换方向 + + // 选中的列变化时,自动检测分组 + useEffect(() => { + const valueVars = form.getFieldValue('valueVars'); + if (valueVars && valueVars.length >= 2) { + detectGrouping(valueVars); + } else { + setGrouping(null); + setPreviewData([]); + setPreviewColumns([]); + } + }, []); + + /** + * 自动检测多指标分组 + */ + const detectGrouping = async (valueVars: string[]) => { + if (!valueVars || valueVars.length < 2) { + setGrouping(null); + return; + } + + setDetecting(true); + try { + console.log(`[MultiMetricPanel] 检测多指标分组: ${valueVars.length} 
列`); + + const response = await fetch('/api/v1/dc/tool-c/multi-metric/detect', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + valueVars, + }), + }); + + const data = await response.json(); + + if (data.success) { + setGrouping(data.grouping); + console.log(`[MultiMetricPanel] 分组检测成功:`, data.grouping); + + // 自动生成预览 + generatePreview( + form.getFieldValue('idVars') || [], + valueVars, + data.grouping + ); + } else { + console.error(`[MultiMetricPanel] 分组检测失败:`, data.error); + setGrouping({ + success: false, + message: data.error || '分组检测失败' + }); + } + } catch (error: any) { + console.error(`[MultiMetricPanel] 分组检测异常:`, error); + setGrouping({ + success: false, + message: error.message || '分组检测异常' + }); + } finally { + setDetecting(false); + } + }; + + /** + * 生成预览数据 + */ + const generatePreview = async (idVars: string[], valueVars: string[], groupingData: MetricGrouping) => { + if (!groupingData.success || !idVars || idVars.length === 0) { + return; + } + + setLoading(true); + try { + // 根据转换方向调用不同的API + const action = direction === 'to_long' ? 'multi_metric_to_long' : 'multi_metric_to_matrix'; + const params = direction === 'to_long' + ? 
{ + idVars, + valueVars, + eventColName: form.getFieldValue('eventColName') || 'Event_Name', + } + : { + idVars, + valueVars, + metricColName: form.getFieldValue('metricColName') || '指标名', + }; + + // 调用preview API + const response = await fetch('/api/v1/dc/tool-c/quick-action/preview', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + action, + params, + }), + }); + + const data = await response.json(); + + if (data.success) { + const resultData = data.data.newDataPreview || []; + + if (resultData.length > 0) { + // 生成列定义 + const cols = Object.keys(resultData[0]).map((key) => ({ + title: key, + dataIndex: key, + key, + width: 150, + })); + + setPreviewColumns(cols); + setPreviewData(resultData.slice(0, 10)); // 只显示前10行 + } + } else { + console.error(`[MultiMetricPanel] 预览失败:`, data.error); + } + } catch (error: any) { + console.error(`[MultiMetricPanel] 预览异常:`, error); + } finally { + setLoading(false); + } + }; + + /** + * 处理表单提交 + */ + const handleSubmit = async () => { + try { + const values = await form.validateFields(); + + if (!grouping || !grouping.success) { + message.error('请先检测指标分组'); + return; + } + + setLoading(true); + + console.log(`[MultiMetricPanel] 提交多指标转换 (${direction}):`, values); + + // 根据转换方向调用不同的API + const action = direction === 'to_long' ? 'multi_metric_to_long' : 'multi_metric_to_matrix'; + const params = direction === 'to_long' + ? 
{ + idVars: values.idVars, + valueVars: values.valueVars, + eventColName: values.eventColName || 'Event_Name', + } + : { + idVars: values.idVars, + valueVars: values.valueVars, + metricColName: values.metricColName || '指标名', + }; + + // 调用快速操作API执行转换 + const response = await fetch('/api/v1/dc/tool-c/quick-action', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + action, + params, + }), + }); + + const result = await response.json(); + + if (!result.success) { + throw new Error(result.error || '多指标转换失败'); + } + + const successMsg = direction === 'to_long' ? '多指标转长表成功!' : '多指标转矩阵成功!'; + message.success(successMsg); + + // 更新父组件数据 + if (result.data?.newDataPreview) { + onConfirm(result.data.newDataPreview); + } + + // 成功后关闭 + onCancel(); + + } catch (error: any) { + console.error(`[MultiMetricPanel] 转换失败:`, error); + message.error(error.message || '执行失败'); + } finally { + setLoading(false); + } + }; + + /** + * 处理值列变化 + */ + const handleValueVarsChange = (valueVars: string[]) => { + form.setFieldsValue({ valueVars }); + detectGrouping(valueVars); + }; + + /** + * 处理ID列变化 + */ + const handleIdVarsChange = (idVars: string[]) => { + form.setFieldsValue({ idVars }); + + // 重新生成预览 + const valueVars = form.getFieldValue('valueVars'); + if (grouping && grouping.success && valueVars && valueVars.length >= 2) { + generatePreview(idVars, valueVars, grouping); + } + }; + + return ( +
+
+ {/* 0. 转换方向 */} + +
+ 📍 转换方向: +
+ { + setDirection(e.target.value); + // 重新生成预览 + const idVars = form.getFieldValue('idVars'); + const valueVars = form.getFieldValue('valueVars'); + if (grouping && grouping.success && idVars && valueVars && valueVars.length >= 2) { + generatePreview(idVars, valueVars, grouping); + } + }} + style={{ width: '100%' }} + > + + +
+ 时间点→行,指标→列 分析格式 +
+ 适用于:统计分析、混合效应模型、GEE、数据可视化 +
+
+
+ +
+ 时间点→列,指标→行 展示格式 +
+ 适用于:临床报告、数据审查表、CRF核对、单受试者数据审查 +
+
+
+
+
+
+ + {/* 1. 选择ID列 */} + + + + + {/* 2. 选择值列 */} + + 2️⃣ 选择值列(自动分组) + + + + +
+ } + name="valueVars" + rules={[ + { required: true, message: '请选择至少2个值列' }, + { + validator: (_, value) => { + if (value && value.length >= 2) { + return Promise.resolve(); + } + return Promise.reject(new Error('至少需要选择2列才能进行转换')); + }, + }, + ]} + extra="选择多个指标的多个时间点列(例如:FMA总得分_基线、FMA总得分_随访1、ADL总分_基线、ADL总分_随访1)" + > + + + + {/* 3. 自动检测结果 */} + {detecting && ( +
+ +
+ )} + + {!detecting && grouping && ( + + {grouping.success ? ( + +
+ ✓ 检测到 {Object.keys(grouping.metric_groups || {}).length} 个指标: +
+ {Object.entries(grouping.metric_groups || {}).map(([metric, cols]) => ( + + {metric} ({cols.length}列) + + ))} +
+
+ +
+ ✓ 检测到 {grouping.timepoints?.length || 0} 个时间点: +
+ {grouping.timepoints?.slice(0, 5).map((tp) => ( + + {tp} + + ))} + {(grouping.timepoints?.length || 0) > 5 && ( + ... 还有 {(grouping.timepoints?.length || 0) - 5} 个 + )} +
+
+ +
+ ✓ 分隔符: + {grouping.separator || '(无)'} +
+ + {grouping.confidence !== undefined && grouping.confidence < 1.0 && ( + + )} +
+ ) : ( + + )} +
+ )} + + {/* 4. 列名设置 */} + {direction === 'to_long' ? ( + + + + ) : ( + + + + )} + + {/* 5. 预览结果 */} + {previewData.length > 0 && ( + <> + 预览结果(前10行) +
+ + + + )} + + + {/* 操作按钮 */} +
+ + + + +
+ + ); +}; + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/PivotPanel.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/PivotPanel.tsx new file mode 100644 index 00000000..432a3e80 --- /dev/null +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/PivotPanel.tsx @@ -0,0 +1,287 @@ +/** + * Pivot面板:长表→宽表 + * 从PivotDialog.tsx提取,作为TransformDialog的子组件 + */ + +import React, { useState } from 'react'; +import { Select, Button, Alert, Checkbox, Radio, App } from 'antd'; +import { ArrowLeftRight, Info } from 'lucide-react'; + +interface Props { + columns: Array<{ id: string; name: string }>; + sessionId: string | null; + onApply: (newData: any[]) => void; + onClose: () => void; +} + +const PivotPanel: React.FC = ({ + columns, + sessionId, + onApply, + onClose, +}) => { + const { message } = App.useApp(); + const [indexColumn, setIndexColumn] = useState(''); + const [pivotColumn, setPivotColumn] = useState(''); + const [valueColumns, setValueColumns] = useState([]); + const [aggfunc, setAggfunc] = useState<'first' | 'last' | 'mean' | 'sum'>('first'); + const [loading, setLoading] = useState(false); + const [keepUnusedColumns, setKeepUnusedColumns] = useState(false); + const [unusedAggMethod, setUnusedAggMethod] = useState<'first' | 'mode' | 'mean'>('first'); + + // 执行 + const handleApply = async () => { + if (!sessionId) { + message.error('Session ID不存在'); + return; + } + + // 验证 + if (!indexColumn) { + message.warning('请选择索引列(唯一标识列)'); + return; + } + + if (!pivotColumn) { + message.warning('请选择透视列(要变成列名的列)'); + return; + } + + if (valueColumns.length === 0) { + message.warning('请至少选择一个值列'); + return; + } + + setLoading(true); + + try { + const response = await fetch('/api/v1/dc/tool-c/quick-action', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + action: 'pivot', + params: { + indexColumn, + pivotColumn, + valueColumns, + aggfunc, + keepUnusedColumns, + unusedAggMethod, + }, + }), + 
}); + + const result = await response.json(); + + if (!result.success) { + throw new Error(result.error || 'Pivot转换失败'); + } + + message.success('Pivot转换成功!'); + + // 更新父组件数据 + if (result.data?.newDataPreview) { + onApply(result.data.newDataPreview); + } + + // 成功后关闭 + onClose(); + } catch (error: any) { + message.error(error.message || '执行失败'); + } finally { + setLoading(false); + } + }; + + return ( +
+ {/* 说明 */} + +
• 将"一人多行"的纵向数据转为"一人一行"的横向数据
+
• 典型场景:将随访数据(基线、随访2周、随访1个月)展开为独立列
+
• 示例:Event_Name列的"基线"、"随访2周" → FMA_基线、FMA_随访2周
+
+ } + type="info" + showIcon + icon={} + className="mb-4" + /> + + {/* 索引列 */} +
+ + + (option?.label ?? '').toLowerCase().includes(input.toLowerCase()) + } + options={columns + .filter(col => col.id !== indexColumn) + .map(col => ({ label: col.name, value: col.id }))} + /> +
+ 此列的不同值将成为新的列名后缀 +
+
+ + {/* 值列 */} +
+ +
+ + {columns + .filter(col => col.id !== indexColumn && col.id !== pivotColumn) + .map((col) => ( + + {col.name} + + ))} + +
+
+ 可多选。每个值列会生成多个新列(如:FMA_基线、FMA_随访2周) +
+
+ + {/* 聚合方式 */} +
+ + setAggfunc(e.target.value)}> +
+ +
+ 取第一个值 + (推荐) +
+
+ + 取最后一个值 + + + 求平均值 + + + 求和 + +
+
+
+ + {/* 高级选项 */} +
+
+ ⚙️ 高级选项 +
+ + setKeepUnusedColumns(e.target.checked)} + > + 保留未选择的列 + + + {keepUnusedColumns && ( +
+ + setUnusedAggMethod(e.target.value)} + > +
+ +
+ 取第一个值 + (默认) +
+
+ + 取众数 + + +
+ 取均值 + (仅数值列) +
+
+
+
+
+ )} +
+ + {/* 警告 */} + +
• Pivot操作会显著改变数据结构(行列转换)
+
• 转换后列数可能大幅增加
+ + } + type="warning" + showIcon={false} + className="bg-orange-50 border-orange-200" + /> + + {/* 底部按钮 */} +
+ + +
+ + ); +}; + +export default PivotPanel; + + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/Toolbar.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/Toolbar.tsx index f8004c38..b31b685c 100644 --- a/frontend-v2/src/modules/dc/pages/tool-c/components/Toolbar.tsx +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/Toolbar.tsx @@ -131,7 +131,7 @@ const Toolbar: React.FC = ({ {/* 高级按钮(Phase 3) */} void; + onApply: (newData: any[]) => void; + columns: Array<{ id: string; name: string }>; + sessionId: string | null; +} + +const TransformDialog: React.FC = ({ + visible, + onClose, + onApply, + columns, + sessionId, +}) => { + const [activeTab, setActiveTab] = useState<'pivot' | 'unpivot' | 'multi_metric'>('pivot'); + + return ( + + + 数据表转换 + + } + open={visible} + onCancel={onClose} + width={750} + footer={null} + destroyOnClose + > + setActiveTab(key as any)} + items={[ + { + key: 'pivot', + label: ( + + + 长→宽表 (Pivot) + + ), + children: ( + + ), + }, + { + key: 'unpivot', + label: ( + + + 宽→长表 (Unpivot) + + ), + children: ( + + ), + }, + { + key: 'multi_metric', + label: ( + + + 多指标转换 + + ), + children: ( + c.name)} + onConfirm={onApply} + onCancel={onClose} + /> + ), + }, + ]} + /> + + ); +}; + +export default TransformDialog; + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/components/UnpivotPanel.tsx b/frontend-v2/src/modules/dc/pages/tool-c/components/UnpivotPanel.tsx new file mode 100644 index 00000000..ccc7ef59 --- /dev/null +++ b/frontend-v2/src/modules/dc/pages/tool-c/components/UnpivotPanel.tsx @@ -0,0 +1,392 @@ +/** + * Unpivot面板:宽表→长表 + * 新功能:将横向数据转为纵向重复数据 + */ + +import React, { useState } from 'react'; +import { Select, Button, Alert, Checkbox, App, Input, Collapse, Radio } from 'antd'; +import { ArrowLeftRight, Info } from 'lucide-react'; + +interface Props { + columns: Array<{ id: string; name: string }>; + sessionId: string | null; + onApply: (newData: any[]) => void; + onClose: () => void; +} + +const 
UnpivotPanel: React.FC = ({ + columns, + sessionId, + onApply, + onClose, +}) => { + const { message } = App.useApp(); + const [idVars, setIdVars] = useState([]); + const [valueVars, setValueVars] = useState([]); + const [varName, setVarName] = useState('时间点'); + const [valueName, setValueName] = useState('测量值'); + const [loading, setLoading] = useState(false); + + // 高级选项 + const [parseColumnNames, setParseColumnNames] = useState(false); + const [separator, setSeparator] = useState('_'); + const [metricName, setMetricName] = useState('指标'); + const [timeName, setTimeName] = useState('时间点'); + + // 执行 + const handleApply = async () => { + if (!sessionId) { + message.error('Session ID不存在'); + return; + } + + // 验证 + if (idVars.length === 0) { + message.warning('请至少选择1个ID列'); + return; + } + + if (valueVars.length < 2) { + message.warning('请至少选择2个值列'); + return; + } + + if (!varName.trim()) { + message.warning('请输入变量名列名'); + return; + } + + if (!valueName.trim()) { + message.warning('请输入值列名'); + return; + } + + // 验证:ID列和值列不能重复 + const overlap = idVars.filter(id => valueVars.includes(id)); + if (overlap.length > 0) { + message.error(`ID列和值列不能重复:${overlap.join(', ')}`); + return; + } + + // 高级选项验证 + if (parseColumnNames) { + if (!separator.trim()) { + message.warning('请输入分隔符'); + return; + } + if (!metricName.trim()) { + message.warning('请输入指标列名'); + return; + } + if (!timeName.trim()) { + message.warning('请输入时间列名'); + return; + } + } + + setLoading(true); + + try { + const response = await fetch('/api/v1/dc/tool-c/quick-action', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + sessionId, + action: 'unpivot', + params: { + idVars, + valueVars, + varName, + valueName, + parseColumnNames, + separator, + metricName: parseColumnNames ? metricName : undefined, + timeName: parseColumnNames ? 
timeName : undefined, + dropna: false, + }, + }), + }); + + const result = await response.json(); + + if (!result.success) { + throw new Error(result.error || 'Unpivot转换失败'); + } + + message.success('Unpivot转换成功!'); + + // 更新父组件数据 + if (result.data?.newDataPreview) { + onApply(result.data.newDataPreview); + } + + // 成功后关闭 + onClose(); + } catch (error: any) { + message.error(error.message || '执行失败'); + } finally { + setLoading(false); + } + }; + + // 示例数据(用于说明) + const selectedValueVarsSample = valueVars.slice(0, 3); + const hasMoreValueVars = valueVars.length > 3; + + return ( +
+ {/* 说明 */} + +
• 将"一人一行"的横向数据转为"一人多行"的纵向数据
+
• 典型场景:多时间点随访数据(FMA_基线、FMA_2周 → 时间点列 + FMA值列)
+
• 示例:FMA_基线、FMA_随访2周 → Event_Name列(基线、随访2周)+ FMA列
+
+ } + type="info" + showIcon + icon={} + className="mb-4" + /> + + {/* ID列 */} +
+ +
+ + {columns.map((col) => ( + + {col.name} + + ))} + +
+
+ 这些列的值在转换后保持不变(如:患者ID、性别、年龄) +
+
+ + {/* 值列 */} +
+ +
+ + {columns.map((col) => ( + + {col.name} + + ))} + +
+
+ 至少选择2列。这些列将被转换为两列:变量名列 + 值列(已选择 {valueVars.length} 列) +
+
+ + {/* 列名设置 */} +
+
+ + setVarName(e.target.value)} + maxLength={30} + /> +
+ 存储原列名 +
+
+ +
+ + setValueName(e.target.value)} + maxLength={30} + /> +
+ 存储实际值 +
+
+
+ + {/* 预览示例 */} + {valueVars.length > 0 && ( + +
转换示例:
+
+ {selectedValueVarsSample.map((varId) => { + const col = columns.find(c => c.id === varId); + return ( +
+ "{col?.name}" → {varName}="{col?.name}", {valueName}=值 +
+ ); + })} + {hasMoreValueVars && ( +
...(还有 {valueVars.length - 3} 列)
+ )} +
+ + } + type="success" + showIcon={false} + className="bg-green-50 border-green-200" + /> + )} + + {/* 高级选项:列名解析 */} + + ⚙️ 高级选项:智能解析列名 + {parseColumnNames && ( + 已启用 + )} + + ), + children: ( +
+ setParseColumnNames(e.target.checked)} + > + 启用列名解析 + + + {parseColumnNames && ( + <> + + 解析列名可以将"FMA_基线"拆分为:"FMA"(指标) + "基线"(时间点) +
+ } + type="info" + showIcon={false} + className="bg-blue-50 border-blue-200" + /> + +
+ + setSeparator(e.target.value)}> + 下划线(_) + 连字符(-) + 点号(.) + +
+ +
+
+ + setMetricName(e.target.value)} + maxLength={30} + /> +
+ +
+ + setTimeName(e.target.value)} + maxLength={30} + /> +
+
+ + {/* 解析示例 */} + {valueVars.length > 0 && ( +
+
解析示例:
+ {selectedValueVarsSample.map((varId) => { + const col = columns.find(c => c.id === varId); + const parts = col?.name.split(separator); + const metric = parts?.[0] || col?.name; + const time = parts?.slice(1).join(separator) || ''; + + return ( +
+ "{col?.name}" → {metricName}="{metric}", {timeName}="{time}" +
+ ); + })} +
+ )} + + )} + + ), + }, + ]} + /> + + {/* 警告 */} + +
• Unpivot操作会扩展数据(行数增加 = 原行数 × 值列数)
+
• 转换后列数会减少,但行数会大幅增加
+ + } + type="warning" + showIcon={false} + className="bg-orange-50 border-orange-200" + /> + + {/* 底部按钮 */} +
+ + +
+ + ); +}; + +export default UnpivotPanel; + + + diff --git a/frontend-v2/src/modules/dc/pages/tool-c/index.tsx b/frontend-v2/src/modules/dc/pages/tool-c/index.tsx index 8793b289..b758eff8 100644 --- a/frontend-v2/src/modules/dc/pages/tool-c/index.tsx +++ b/frontend-v2/src/modules/dc/pages/tool-c/index.tsx @@ -16,7 +16,7 @@ import BinningDialog from './components/BinningDialog'; import ConditionalDialog from './components/ConditionalDialog'; import MissingValueDialog from './components/MissingValueDialog'; import ComputeDialog from './components/ComputeDialog'; -import PivotDialog from './components/PivotDialog'; +import TransformDialog from './components/TransformDialog'; import * as api from '../../api/toolC'; // ==================== 类型定义 ==================== @@ -341,7 +341,7 @@ const ToolC = () => { onApply={handleQuickActionDataUpdate} /> -