feat(dc): Complete Tool C quick action buttons Phase 1-2 - 7 functions
Summary: - Implement 7 quick action functions (filter, recode, binning, conditional, dropna, compute, pivot) - Refactor to pre-written Python functions architecture (stable and secure) - Add 7 Python operations modules with full type hints - Add 7 frontend Dialog components with user-friendly UI - Fix NaN serialization issues and auto type conversion - Update all related documentation Technical Details: - Python: operations/ module (filter.py, recode.py, binning.py, conditional.py, dropna.py, compute.py, pivot.py) - Backend: QuickActionService.ts with 7 execute methods - Frontend: 7 Dialog components with complete validation - Toolbar: Enable 7 quick action buttons Status: Phase 1-2 completed, basic testing passed, ready for further testing
This commit is contained in:
@@ -64,6 +64,14 @@ export class AICodeService {
|
||||
|
||||
// 1. 获取Session信息(数据集元数据)
|
||||
const session = await sessionService.getSession(sessionId);
|
||||
|
||||
// ✨ 2. 判断是否为数据探索问题
|
||||
const isDataExploration = this.isDataExplorationQuery(userMessage);
|
||||
|
||||
if (isDataExploration) {
|
||||
logger.info('[AICodeService] 检测到数据探索问题,直接回答');
|
||||
return this.handleDataExploration(sessionId, session, userMessage);
|
||||
}
|
||||
|
||||
// 2. 构建System Prompt(含10个Few-shot示例)
|
||||
const systemPrompt = this.buildSystemPrompt({
|
||||
@@ -152,11 +160,20 @@ export class AICodeService {
|
||||
}
|
||||
});
|
||||
|
||||
// 4. 如果成功,获取新数据预览(前50行)
|
||||
// 4. 如果成功,保存完整处理结果到OSS并获取预览
|
||||
if (result.success && result.result_data) {
|
||||
const preview = Array.isArray(result.result_data)
|
||||
? result.result_data.slice(0, 50)
|
||||
: result.result_data;
|
||||
|
||||
// ✅ 保存完整的处理结果到OSS(覆盖原文件)
|
||||
try {
|
||||
await sessionService.saveProcessedData(sessionId, result.result_data);
|
||||
logger.info(`[AICodeService] 处理结果已保存到OSS`);
|
||||
} catch (saveError: any) {
|
||||
logger.error(`[AICodeService] 保存处理结果失败: ${saveError.message}`);
|
||||
// 不阻断流程,只记录错误
|
||||
}
|
||||
|
||||
logger.info(`[AICodeService] 代码执行成功`);
|
||||
|
||||
@@ -297,6 +314,115 @@ export class AICodeService {
|
||||
}
|
||||
|
||||
// ==================== 辅助方法 ====================
|
||||
|
||||
/**
 * Decide whether a user message is a read-only "data exploration" question
 * (counts, statistics, column info) rather than a data-cleaning request.
 *
 * The decision is a keyword heuristic: the message must contain at least one
 * exploration keyword and none of the cleaning keywords.
 *
 * @param message - Raw user message.
 * @returns `true` when the message should be answered directly instead of
 *          going through code generation.
 * @private
 */
private isDataExplorationQuery(message: string): boolean {
  const explorationKeywords = [
    // Counting questions
    '有多少', '多少个', '数量', '统计', '总共', '一共',
    // Look-up questions
    '查看', '显示', '看看', '列出', '什么', '哪些',
    // Missing values ('NA' is handled separately below)
    '缺失值', '空值', '缺失率',
    // Summary statistics
    '平均值', '均值', '中位数', '最大值', '最小值', '标准差', '方差',
    // Data types
    '数据类型', '类型是', '是什么类型',
    // Column information
    '列名', '有哪些列', '字段名',
    // Distribution
    '分布', '占比', '比例',
  ];

  // Exclusion keywords: any of these means the user wants the data changed.
  const cleaningKeywords = [
    '删除', '去除', '填补', '替换', '转换', '生成', '创建', '修改',
    '筛选', '过滤', '合并', '拆分', '排序',
  ];

  // 'NA' only counts as a standalone token. A plain substring match would
  // also fire on identifiers such as "DNA" or "NAME" and misroute the request.
  const mentionsNaToken = /(^|[^A-Za-z0-9_])NA(?![A-Za-z0-9_])/.test(message);

  const hasExplorationKeyword =
    mentionsNaToken || explorationKeywords.some(kw => message.includes(kw));
  const hasCleaningKeyword = cleaningKeywords.some(kw => message.includes(kw));

  // Exploration only when an exploration keyword is present and no cleaning keyword is.
  return hasExplorationKeyword && !hasCleaningKeyword;
}
|
||||
|
||||
/**
 * Answer a data-exploration question directly from the statistics cached on
 * the session, without generating code.
 *
 * Builds a system prompt from the session metadata and `session.dataStats`,
 * sends it with the user message to the LLM, stores the exchange through
 * `saveMessages` with an empty code string, and returns the LLM answer as
 * the explanation.
 *
 * @param sessionId - Session the question belongs to.
 * @param session - Session record; read for `fileName`, `totalRows`,
 *                  `totalCols`, `columns`, `dataStats` and `userId`.
 * @param userMessage - The user's question.
 * @returns Result with an empty `code`, the answer text and the saved message id.
 * @throws Re-throws any error from the LLM call or from saving the messages.
 * @private
 */
private async handleDataExploration(
  sessionId: string,
  session: any,
  userMessage: string
): Promise<GenerateCodeResult> {
  try {
    // 1. Cached statistics; fall back to an empty list when none were stored.
    const stats = session.dataStats || { columnStats: [] };

    // `columns` comes from a JSON field, so guard against a non-array value
    // instead of crashing on `.join`.
    const columnNames: unknown[] = Array.isArray(session.columns) ? session.columns : [];

    // 2. One section per column. Lines for absent statistics are dropped so
    //    the prompt carries no blank filler lines.
    const columnSections: string[] = (stats.columnStats || []).map((col: any) => {
      const hasTopValues = Array.isArray(col.topValues) && col.topValues.length > 0;
      const lines = [
        `**${col.name}列**`,
        `- 数据类型:${col.dataType}`,
        `- 缺失值数量:${col.missingCount} (${col.missingRate})`,
        `- 唯一值数量:${col.uniqueCount}`,
        col.mean !== undefined && col.mean !== null ? `- 平均值:${col.mean}` : '',
        col.median !== undefined && col.median !== null ? `- 中位数:${col.median}` : '',
        col.min !== undefined && col.min !== null ? `- 最小值:${col.min}` : '',
        col.max !== undefined && col.max !== null ? `- 最大值:${col.max}` : '',
        hasTopValues
          ? `- 最常见的值:${col.topValues.map((v: any) => `${v.value}(${v.count}次)`).join(', ')}`
          : '',
      ];
      return lines.filter(line => line !== '').join('\n');
    });

    const systemPrompt = `你是数据分析助手。当前数据集的详细统计信息如下:

**数据集基本信息**
- 文件名:${session.fileName}
- 总行数:${session.totalRows}
- 总列数:${session.totalCols}
- 列名:${columnNames.join(', ')}

**各列详细统计**

${columnSections.join('\n\n')}

请根据以上统计信息,直接回答用户的问题。注意:
1. 直接给出答案,不要生成代码
2. 引用具体的统计数字
3. 简洁明了
`;

    // 3. Ask the LLM.
    const llm = LLMFactory.getAdapter('deepseek-v3' as ModelType);
    const response = await llm.chat(
      [
        { role: 'system', content: systemPrompt },
        { role: 'user', content: userMessage },
      ],
      {
        temperature: 0.3,
        maxTokens: 500,
      }
    );

    // 4. Persist the exchange; the code argument is an empty string, not null.
    const messageId = await this.saveMessages(
      sessionId,
      session.userId,
      userMessage,
      '',
      response.content
    );

    logger.info(`[AICodeService] 数据探索回答完成: messageId=${messageId}`);

    return {
      code: '',
      explanation: response.content,
      messageId,
    };
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`[AICodeService] 数据探索处理失败: ${reason}`);
    // This method does not fall back to code generation itself; it rethrows
    // so the caller decides how to recover.
    throw error;
  }
}
|
||||
|
||||
/**
|
||||
* 构建System Prompt(含10个Few-shot示例)
|
||||
@@ -455,8 +581,55 @@ df = df.sort_values('check_date').drop_duplicates(subset=['patient_id'], keep='l
|
||||
\`\`\`
|
||||
说明: 先按日期排序,再去重保留最后一条(最新)
|
||||
|
||||
## ⚠️ 复杂需求处理策略(重要)
|
||||
|
||||
**如果用户提出包含多个步骤的复杂需求(如:5个以上变量转换、多个筛选条件、复杂分组等),请遵循以下策略:**
|
||||
|
||||
### 策略1:主动建议拆分(推荐)
|
||||
当检测到复杂需求时,**直接在explanation中建议用户分步骤执行**,而非生成一次性代码。
|
||||
|
||||
**示例响应:**
|
||||
\`\`\`json
|
||||
{
|
||||
"code": "",
|
||||
"explanation": "您的需求包含10个步骤,建议分步骤执行以确保准确性:\\n\\n**第1步**:变量重编码(研究中心、婚姻状况、针刺选穴组方等分类变量)\\n**第2步**:严重不良事件记录处理(转移至基线数据行)\\n**第3步**:新增暴露分组列(根据督脉针刺持续时间)\\n**第4步**:新增不同暴露强度分组\\n**第5步**:纵向数据转横向(FMA、ADL、NLR、PLR评分)\\n**第6步**:列名清理(去除括号内容)\\n\\n💡 **建议**:请先告诉我您想从哪一步开始,我会为每一步生成专门的代码。这样更容易调试和验证结果。"
|
||||
}
|
||||
\`\`\`
|
||||
|
||||
### 策略2:生成第一步代码(如果用户坚持)
|
||||
如果用户明确要求一次性处理,只生成**前1-2个最基础的步骤**,并在explanation中说明:
|
||||
|
||||
**示例:**
|
||||
\`\`\`python
|
||||
# 第1步:变量重编码 - 研究中心
|
||||
try:
|
||||
center_mapping = {
|
||||
'黑龙江中医药大学附属第二医院': 1,
|
||||
'山东中医药大学附属医院': 2,
|
||||
'广州中医药大学附属第一医院': 3
|
||||
}
|
||||
df['研究中心_编码'] = df['研究中心:'].map(center_mapping)
|
||||
print(f'研究中心编码完成,缺失值: {df["研究中心_编码"].isna().sum()}')
|
||||
except Exception as e:
|
||||
print(f'编码错误: {e}')
|
||||
|
||||
# 第2步:婚姻状况编码
|
||||
try:
|
||||
df['婚姻状况_编码'] = df['婚姻状况'].apply(lambda x: 1 if x == '已婚' else 2)
|
||||
print(f'婚姻状况编码完成')
|
||||
except Exception as e:
|
||||
print(f'编码错误: {e}')
|
||||
\`\`\`
|
||||
说明: 已完成前2个变量的重编码。请确认结果无误后,再继续后续步骤(针刺选穴组方、严重不良事件等)。
|
||||
|
||||
### 策略3:检测列名冲突
|
||||
**重要**:如果列名中包含括号、冒号等标点符号(如"研究中心:"、"性别(男=1,女=0)"),需要:
|
||||
1. 先确认实际列名(使用df.columns.tolist()检查)
|
||||
2. 使用精确列名进行操作
|
||||
3. 建议用户先执行"列名清理"步骤
|
||||
|
||||
## 用户当前请求
|
||||
请根据以上示例和当前数据集信息,生成代码并解释。返回JSON格式:{"code": "...", "explanation": "..."}`;
|
||||
请根据以上示例和当前数据集信息,生成代码并解释。**如果需求复杂(>3个步骤),请主动建议拆分。** 返回JSON格式:{"code": "...", "explanation": "..."}`;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
324
backend/src/modules/dc/tool-c/services/QuickActionService.ts
Normal file
324
backend/src/modules/dc/tool-c/services/QuickActionService.ts
Normal file
@@ -0,0 +1,324 @@
|
||||
/**
|
||||
* 快速操作服务
|
||||
*
|
||||
* 功能:调用Python微服务的预写函数API
|
||||
*
|
||||
* @module QuickActionService
|
||||
*/
|
||||
|
||||
import { logger } from '../../../../common/logging/index.js';
|
||||
import axios from 'axios';
|
||||
|
||||
const PYTHON_SERVICE_URL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';
|
||||
|
||||
// ==================== 类型定义 ====================
|
||||
|
||||
/**
 * Parameters for the advanced-filter quick action.
 * `conditions` and `logic` are forwarded unchanged to the Python filter endpoint.
 */
interface FilterParams {
  /** Row conditions to evaluate. */
  conditions: Array<{
    /** Column the condition applies to. */
    column: string;
    /** Operator identifier; accepted values are defined by the Python service (not visible here). */
    operator: string;
    /** Comparison operand; optional — presumably for operators that take none (TODO confirm). */
    value?: unknown;
  }>;
  /** How multiple conditions are combined. */
  logic: 'and' | 'or';
}
|
||||
|
||||
/**
 * Parameters for the recode (value mapping) quick action.
 * Fields are forwarded to the Python recode endpoint in snake_case.
 */
interface RecodeParams {
  /** Column whose values are recoded. */
  column: string;
  /** Map from original value (as a string key) to replacement value. */
  mapping: Record<string, unknown>;
  /** Whether the result goes into a new column; forwarded as `create_new_column`. */
  createNewColumn: boolean;
  /** Name of the new column; forwarded as `new_column_name`. */
  newColumnName?: string;
}
|
||||
|
||||
/**
 * Parameters for the binning quick action.
 * Fields are forwarded to the Python binning endpoint in snake_case.
 */
interface BinningParams {
  /** Column to bin. */
  column: string;
  /** Binning strategy identifier. */
  method: 'custom' | 'equal_width' | 'equal_freq';
  /** Name of the column that receives the bin assignment. */
  newColumnName: string;
  /** Bin boundaries — presumably only used with the 'custom' method (TODO confirm). */
  bins?: Array<number>;
  /** Labels for the resulting bins. */
  labels?: Array<string | number>;
  /** Number of bins — presumably for the equal-width / equal-frequency methods (TODO confirm). */
  numBins?: number;
}
|
||||
|
||||
/**
 * Parameters for the conditional-column quick action.
 * `rules` is forwarded unchanged; the other fields are sent in snake_case.
 */
interface ConditionalParams {
  /** Name of the column to create. */
  newColumnName: string;
  /** Rules, each pairing a set of conditions with a result value. */
  rules: Array<{
    /** Conditions that make up this rule. */
    conditions: Array<{
      /** Column the condition applies to. */
      column: string;
      /** Operator identifier; accepted values are defined by the Python service (not visible here). */
      operator: string;
      /** Comparison operand. */
      value: unknown;
    }>;
    /** How the conditions inside this rule are combined. */
    logic: 'and' | 'or';
    /** Value associated with this rule. */
    result: unknown;
  }>;
  /** Fallback value; forwarded as `else_value`. */
  elseValue?: unknown;
}
|
||||
|
||||
/**
 * Parameters for the drop-missing-values quick action.
 */
interface DropnaParams {
  /** Which axis to drop along: rows, columns, or both. */
  method: 'row' | 'column' | 'both';
  /** Threshold forwarded to the Python service; the service class substitutes 0.5 when it is omitted. */
  threshold?: number;
  /** Column names forwarded as `subset` — presumably limits the check to these columns (TODO confirm). */
  subset?: Array<string>;
}
|
||||
|
||||
/**
 * Parameters for the computed-column quick action.
 */
interface ComputeParams {
  /** Name of the column to create; forwarded as `new_column_name`. */
  newColumnName: string;
  /** Formula text forwarded to the Python service; its syntax is defined there (not visible here). */
  formula: string;
}
|
||||
|
||||
/**
 * Parameters for the pivot (long table to wide table) quick action.
 * Fields are forwarded to the Python pivot endpoint in snake_case.
 */
interface PivotParams {
  /** Column forwarded as `index_column`. */
  indexColumn: string;
  /** Column forwarded as `pivot_column`. */
  pivotColumn: string;
  /** Columns forwarded as `value_columns`. */
  valueColumns: Array<string>;
  /** Aggregation function name. */
  aggfunc: 'first' | 'last' | 'mean' | 'sum' | 'min' | 'max';
}
|
||||
|
||||
/**
 * Result envelope returned by every quick-action call.
 * On success it is the Python service's response body; on failure the
 * service class fills in `success: false` and `error`.
 */
interface OperationResult {
  /** Whether the operation succeeded. */
  success: boolean;
  /** Resulting rows, when the service returns them. */
  result_data?: Array<any>;
  /** Text output reported by the service. */
  output?: string;
  /** Execution time reported by the service (unit not visible here). */
  execution_time?: number;
  /** Result dimensions; the first entry is logged as the row count. */
  result_shape?: [number, number];
  /** Error description when the operation failed. */
  error?: string;
}
|
||||
|
||||
// ==================== 服务类 ====================
|
||||
|
||||
export class QuickActionService {
  /** Timeout applied to every request to the Python service (60 seconds). */
  private static readonly REQUEST_TIMEOUT_MS = 60000;

  /**
   * Run the advanced filter.
   *
   * @param data - Rows to filter.
   * @param params - Filter conditions and how they are combined.
   * @returns The service response, or a failure result; never throws.
   */
  async executeFilter(data: any[], params: FilterParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'filter',
      label: '筛选',
      describeRequest: () => `调用筛选API: ${params.conditions.length}个条件`,
      buildPayload: () => ({
        data,
        conditions: params.conditions,
        logic: params.logic,
      }),
      describeSuccess: result => `筛选成功: ${result.result_shape?.[0] || 0} 行`,
    });
  }

  /**
   * Run the recode (value mapping) operation.
   *
   * @param data - Rows to process.
   * @param params - Column, mapping and output-column options.
   * @returns The service response, or a failure result; never throws.
   */
  async executeRecode(data: any[], params: RecodeParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'recode',
      label: '重编码',
      describeRequest: () => `调用重编码API: ${params.column}`,
      buildPayload: () => ({
        data,
        column: params.column,
        mapping: params.mapping,
        create_new_column: params.createNewColumn,
        new_column_name: params.newColumnName,
      }),
    });
  }

  /**
   * Run the binning operation.
   *
   * @param data - Rows to process.
   * @param params - Column, method and bin configuration.
   * @returns The service response, or a failure result; never throws.
   */
  async executeBinning(data: any[], params: BinningParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'binning',
      label: '分箱',
      describeRequest: () => `调用分箱API: ${params.column}, 方法=${params.method}`,
      buildPayload: () => ({
        data,
        column: params.column,
        method: params.method,
        new_column_name: params.newColumnName,
        bins: params.bins,
        labels: params.labels,
        num_bins: params.numBins,
      }),
    });
  }

  /**
   * Run the conditional-column operation.
   *
   * @param data - Rows to process.
   * @param params - New column name, rules and fallback value.
   * @returns The service response, or a failure result; never throws.
   */
  async executeConditional(data: any[], params: ConditionalParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'conditional',
      label: '条件生成列',
      describeRequest: () =>
        `调用条件生成列API: ${params.newColumnName}, ${params.rules.length}条规则`,
      buildPayload: () => ({
        data,
        new_column_name: params.newColumnName,
        rules: params.rules,
        else_value: params.elseValue,
      }),
    });
  }

  /**
   * Run the drop-missing-values operation.
   *
   * @param data - Rows to process.
   * @param params - Drop method, threshold and optional column subset.
   * @returns The service response, or a failure result; never throws.
   */
  async executeDropna(data: any[], params: DropnaParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'dropna',
      label: '删除缺失值',
      describeRequest: () =>
        `调用删除缺失值API: method=${params.method}, threshold=${params.threshold}`,
      buildPayload: () => ({
        data,
        method: params.method,
        // `??` rather than `||`: an explicit threshold of 0 must not become 0.5.
        threshold: params.threshold ?? 0.5,
        subset: params.subset,
      }),
    });
  }

  /**
   * Run the computed-column operation.
   *
   * @param data - Rows to process.
   * @param params - New column name and formula.
   * @returns The service response, or a failure result; never throws.
   */
  async executeCompute(data: any[], params: ComputeParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'compute',
      label: '计算列',
      describeRequest: () =>
        `调用计算列API: ${params.newColumnName}, formula=${params.formula}`,
      buildPayload: () => ({
        data,
        new_column_name: params.newColumnName,
        formula: params.formula,
      }),
    });
  }

  /**
   * Run the pivot (long table to wide table) operation.
   *
   * @param data - Rows to process.
   * @param params - Index, pivot and value columns plus the aggregation function.
   * @returns The service response, or a failure result; never throws.
   */
  async executePivot(data: any[], params: PivotParams): Promise<OperationResult> {
    return this.callOperation({
      endpoint: 'pivot',
      label: 'Pivot',
      describeRequest: () => `调用Pivot API: ${params.indexColumn} × ${params.pivotColumn}`,
      buildPayload: () => ({
        data,
        index_column: params.indexColumn,
        pivot_column: params.pivotColumn,
        value_columns: params.valueColumns,
        aggfunc: params.aggfunc,
      }),
    });
  }

  /**
   * Shared request path for every quick action: log, POST to
   * `/api/operations/<endpoint>`, log the outcome, and convert any failure
   * into an `OperationResult`.
   *
   * The request description and payload are passed as callbacks so that an
   * error while reading malformed `params` is caught here and reported as a
   * failure result, instead of escaping as an exception.
   */
  private async callOperation(op: {
    endpoint: string;
    label: string;
    describeRequest: () => string;
    buildPayload: () => Record<string, unknown>;
    describeSuccess?: (result: OperationResult) => string;
  }): Promise<OperationResult> {
    try {
      logger.info(`[QuickActionService] ${op.describeRequest()}`);

      const response = await axios.post(
        `${PYTHON_SERVICE_URL}/api/operations/${op.endpoint}`,
        op.buildPayload(),
        { timeout: QuickActionService.REQUEST_TIMEOUT_MS }
      );

      const successText = op.describeSuccess
        ? op.describeSuccess(response.data)
        : `${op.label}成功`;
      logger.info(`[QuickActionService] ${successText}`);
      return response.data;
    } catch (error: unknown) {
      return this.toFailure(op.label, error);
    }
  }

  /**
   * Log a failed call and turn the error into an `OperationResult`.
   *
   * A response body that already carries a boolean `success` flag is returned
   * unchanged. Any other object body keeps its fields and gains
   * `success: false` plus an `error` string, so callers can always rely on
   * those two fields.
   */
  private toFailure(label: string, error: unknown): OperationResult {
    const rawMessage = (error as { message?: unknown } | null | undefined)?.message;
    const message = typeof rawMessage === 'string' ? rawMessage : '';
    logger.error(`[QuickActionService] ${label}失败: ${message || String(error)}`);

    const fallback = message || `${label}失败`;
    const body: unknown = (error as { response?: { data?: unknown } } | null | undefined)
      ?.response?.data;

    if (this.isOperationResult(body)) {
      return body;
    }

    if (typeof body === 'object' && body !== null && !Array.isArray(body)) {
      // Body without a `success` flag: normalise it rather than returning a
      // value that does not match OperationResult.
      const fields = body as { error?: unknown; detail?: unknown; message?: unknown };
      const reason = [fields.error, fields.detail, fields.message].find(
        candidate => candidate !== undefined && candidate !== null
      );
      const errorText =
        reason === undefined
          ? fallback
          : typeof reason === 'string'
            ? reason
            : JSON.stringify(reason);
      return { ...body, success: false, error: errorText };
    }

    return { success: false, error: fallback };
  }

  /** Type guard: the value is an object carrying a boolean `success` flag. */
  private isOperationResult(body: unknown): body is OperationResult {
    return (
      typeof body === 'object' &&
      body !== null &&
      typeof (body as { success?: unknown }).success === 'boolean'
    );
  }
}

// ==================== Singleton export ====================

export const quickActionService = new QuickActionService();
|
||||
|
||||
@@ -71,7 +71,13 @@ export class SessionService {
|
||||
logger.info('[SessionService] 解析Excel文件...');
|
||||
let workbook: xlsx.WorkBook;
|
||||
try {
|
||||
workbook = xlsx.read(fileBuffer, { type: 'buffer' });
|
||||
// ✅ 修复:添加解析选项,保留原始格式
|
||||
workbook = xlsx.read(fileBuffer, {
|
||||
type: 'buffer',
|
||||
raw: true, // 保留原始数据,不做类型推断
|
||||
cellText: false, // 不使用格式化文本
|
||||
cellDates: false, // 日期保持为数字
|
||||
});
|
||||
} catch (error: any) {
|
||||
throw new Error(`Excel文件解析失败: ${error.message}`);
|
||||
}
|
||||
@@ -82,7 +88,11 @@ export class SessionService {
|
||||
}
|
||||
|
||||
const sheet = workbook.Sheets[sheetName];
|
||||
const data = xlsx.utils.sheet_to_json(sheet);
|
||||
// ✅ 修复:使用 defval 选项处理空值,raw 保留原始格式
|
||||
const data = xlsx.utils.sheet_to_json(sheet, {
|
||||
raw: false, // 使用格式化后的字符串值(保留"-"等字符)
|
||||
defval: null, // 空单元格使用 null
|
||||
});
|
||||
|
||||
if (data.length === 0) {
|
||||
throw new Error('Excel文件没有数据');
|
||||
@@ -103,9 +113,15 @@ export class SessionService {
|
||||
await storage.upload(fileKey, fileBuffer);
|
||||
logger.info('[SessionService] OSS上传成功');
|
||||
|
||||
// 5. 保存Session到数据库(只存元数据,符合云原生规范)
|
||||
// 5. ✨ 计算数据统计信息(用于数据探索)
|
||||
logger.info('[SessionService] 计算数据统计信息...');
|
||||
const dataStats = this.calculateDataStats(data, columns);
|
||||
logger.info('[SessionService] 统计信息计算完成');
|
||||
|
||||
// 6. 保存Session到数据库(只存元数据,符合云原生规范)
|
||||
const expiresAt = new Date(Date.now() + SESSION_EXPIRE_MINUTES * 60 * 1000);
|
||||
|
||||
// @ts-ignore - dataStats字段在Prisma生成前可能不存在
|
||||
const session = await prisma.dcToolCSession.create({
|
||||
data: {
|
||||
userId,
|
||||
@@ -116,6 +132,7 @@ export class SessionService {
|
||||
columns: columns, // Prisma会自动转换为JSONB
|
||||
encoding: 'utf-8', // 默认utf-8,后续可扩展检测
|
||||
fileSize: fileBuffer.length,
|
||||
dataStats: JSON.parse(JSON.stringify(dataStats)), // ✨ 存储统计信息(转换为JSON)
|
||||
expiresAt,
|
||||
},
|
||||
});
|
||||
@@ -180,10 +197,18 @@ export class SessionService {
|
||||
const buffer = await storage.download(session.fileKey);
|
||||
|
||||
// 3. 内存解析Excel(不落盘)
|
||||
const workbook = xlsx.read(buffer, { type: 'buffer' });
|
||||
const workbook = xlsx.read(buffer, {
|
||||
type: 'buffer',
|
||||
raw: true,
|
||||
cellText: false,
|
||||
cellDates: false,
|
||||
});
|
||||
const sheetName = workbook.SheetNames[0];
|
||||
const sheet = workbook.Sheets[sheetName];
|
||||
const data = xlsx.utils.sheet_to_json(sheet);
|
||||
const data = xlsx.utils.sheet_to_json(sheet, {
|
||||
raw: false,
|
||||
defval: null,
|
||||
});
|
||||
|
||||
// 4. 返回前100行
|
||||
const previewData = data.slice(0, PREVIEW_ROWS);
|
||||
@@ -218,10 +243,18 @@ export class SessionService {
|
||||
const buffer = await storage.download(session.fileKey);
|
||||
|
||||
// 3. 内存解析Excel
|
||||
const workbook = xlsx.read(buffer, { type: 'buffer' });
|
||||
const workbook = xlsx.read(buffer, {
|
||||
type: 'buffer',
|
||||
raw: true,
|
||||
cellText: false,
|
||||
cellDates: false,
|
||||
});
|
||||
const sheetName = workbook.SheetNames[0];
|
||||
const sheet = workbook.Sheets[sheetName];
|
||||
const data = xlsx.utils.sheet_to_json(sheet);
|
||||
const data = xlsx.utils.sheet_to_json(sheet, {
|
||||
raw: false,
|
||||
defval: null,
|
||||
});
|
||||
|
||||
logger.info(`[SessionService] 完整数据获取成功: ${data.length}行`);
|
||||
|
||||
@@ -312,6 +345,48 @@ export class SessionService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Persist the full processed dataset: write it as an xlsx workbook over the
 * session's existing file and refresh the session metadata.
 *
 * The cached `dataStats` are recomputed as well, so later reads of
 * `session.dataStats` describe the processed data and not the original upload.
 *
 * @param sessionId - Session ID.
 * @param processedData - Complete processed rows (array of plain objects).
 * @throws Error when `processedData` is not an array; re-throws any
 *         storage or database error after logging it.
 */
async saveProcessedData(sessionId: string, processedData: any[]): Promise<void> {
  try {
    // Fail with a clear message rather than an obscure error further down.
    if (!Array.isArray(processedData)) {
      throw new Error('处理结果不是数组,无法保存');
    }

    logger.info(`[SessionService] 保存处理数据: ${sessionId}, 行数=${processedData.length}`);

    // 1. Load the session to obtain its file key.
    const session = await this.getSession(sessionId);

    // 2. Serialise the rows into an in-memory xlsx buffer.
    const workbook = xlsx.utils.book_new();
    const worksheet = xlsx.utils.json_to_sheet(processedData);
    xlsx.utils.book_append_sheet(workbook, worksheet, 'Sheet1');
    const buffer = xlsx.write(workbook, { type: 'buffer', bookType: 'xlsx' });

    // 3. Upload under the same key, replacing the previous file.
    logger.info(`[SessionService] 上传处理后数据到OSS: ${session.fileKey}`);
    await storage.upload(session.fileKey, buffer);

    // 4. Refresh metadata and statistics. Columns are taken from the first row.
    const newColumns = Object.keys(processedData[0] || {});
    const dataStats = this.calculateDataStats(processedData, newColumns);

    await prisma.dcToolCSession.update({
      where: { id: sessionId },
      data: {
        totalRows: processedData.length,
        totalCols: newColumns.length,
        columns: newColumns,
        // JSON round-trip mirrors how the statistics are stored at session creation.
        dataStats: JSON.parse(JSON.stringify(dataStats)),
        updatedAt: new Date(),
      },
    });

    logger.info(`[SessionService] 处理数据保存成功: ${sessionId}`);
  } catch (error: any) {
    logger.error(`[SessionService] 保存处理数据失败: ${error.message}`, { sessionId });
    throw error;
  }
}
|
||||
|
||||
/**
|
||||
* 清理过期Session(定时任务使用)
|
||||
*
|
||||
@@ -352,6 +427,135 @@ export class SessionService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compute per-column summary statistics for a dataset.
 *
 * For every column: missing count and rate, unique-value count and detected
 * type. Columns detected as numeric also get mean, median, min and max
 * (mean and median rounded to 2 decimals). Columns detected as categorical
 * with at most 20 unique values get their 5 most frequent values.
 *
 * A cell counts as missing when it is `null`, `undefined`, `''` or the
 * literal string `'NA'`; missing cells are excluded from every other statistic.
 *
 * @param data - All rows, each a plain object keyed by column name.
 * @param columns - Column names to summarise.
 * @returns `{ totalRows, totalCols, columnStats }`.
 */
private calculateDataStats(data: any[], columns: string[]): any {
  const totalRows = data.length;

  const isMissing = (v: unknown): boolean =>
    v === null || v === undefined || v === '' || v === 'NA';

  // Blank strings, booleans and non-finite values are rejected so they
  // cannot distort the numeric statistics.
  const toFiniteNumber = (v: unknown): number | null => {
    if (typeof v === 'number') {
      return Number.isFinite(v) ? v : null;
    }
    if (typeof v === 'string' && v.trim() !== '') {
      const parsed = Number(v);
      return Number.isFinite(parsed) ? parsed : null;
    }
    return null;
  };

  const roundTo2 = (x: number): number => Math.round(x * 100) / 100;

  const columnStats = columns.map(col => {
    const values = data.map(row => row?.[col]);
    const presentValues = values.filter(v => !isMissing(v));

    // Missing-value statistics; guard the rate against an empty dataset.
    const missingCount = values.length - presentValues.length;
    const missingRate =
      (totalRows > 0 ? (missingCount / totalRows) * 100 : 0).toFixed(2) + '%';

    const uniqueCount = new Set(presentValues).size;
    const dataType = this.detectColumnType(values);

    let mean: number | null = null;
    let median: number | null = null;
    let min: number | null = null;
    let max: number | null = null;

    if (dataType === 'numeric') {
      const numericValues = presentValues
        .map(toFiniteNumber)
        .filter((n): n is number => n !== null);

      if (numericValues.length > 0) {
        const total = numericValues.reduce((acc, n) => acc + n, 0);
        mean = roundTo2(total / numericValues.length);

        const sorted = numericValues.slice().sort((a, b) => a - b);
        const mid = Math.floor(sorted.length / 2);
        median = roundTo2(
          sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid]
        );

        // Read the extremes from the sorted copy. Spreading a large array
        // into Math.min / Math.max can exceed the engine's argument limit.
        min = sorted[0];
        max = sorted[sorted.length - 1];
      }
    }

    // Most frequent values for small categorical columns.
    let topValues: Array<{ value: string; count: number }> = [];
    if (dataType === 'categorical' && uniqueCount <= 20) {
      const counts = new Map<string, number>();
      for (const v of presentValues) {
        const key = String(v);
        counts.set(key, (counts.get(key) ?? 0) + 1);
      }

      topValues = Array.from(counts, ([value, count]) => ({ value, count }))
        .sort((a, b) => b.count - a.count)
        .slice(0, 5);
    }

    // Optional statistics are added only when they were computed.
    return {
      name: col,
      missingCount,
      missingRate,
      uniqueCount,
      dataType,
      ...(mean !== null && { mean }),
      ...(median !== null && { median }),
      ...(min !== null && { min }),
      ...(max !== null && { max }),
      ...(topValues.length > 0 && { topValues }),
    };
  });

  return {
    totalRows,
    totalCols: columns.length,
    columnStats,
  };
}
|
||||
|
||||
/**
 * Classify a column by inspecting its values.
 *
 * Checks run in this order and the first match wins:
 * 1. `numeric`     — at least 80% of the non-missing values are numbers.
 * 2. `datetime`    — at least 80% of the non-missing values look like dates.
 * 3. `categorical` — fewer unique values than 20% of the non-missing count,
 *                    and at most 50 unique values.
 * 4. `text`        — everything else.
 *
 * `null`, `undefined`, `''` and the literal `'NA'` are treated as missing
 * and ignored, so placeholder strings do not dilute the ratios.
 *
 * @param values - All values of one column.
 * @returns `'unknown'` when every value is missing, otherwise one of
 *          `'numeric' | 'datetime' | 'categorical' | 'text'`.
 */
private detectColumnType(values: any[]): string {
  const nonNullValues = values.filter(
    v => v !== null && v !== undefined && v !== '' && v !== 'NA'
  );

  if (nonNullValues.length === 0) {
    return 'unknown';
  }

  // Only real numbers and non-blank numeric strings count. `Number(v)` alone
  // would also accept booleans and whitespace-only strings (which become 0).
  const isNumericValue = (v: unknown): boolean => {
    if (typeof v === 'number') {
      return !isNaN(v);
    }
    return typeof v === 'string' && v.trim() !== '' && !isNaN(Number(v));
  };

  const numericCount = nonNullValues.filter(isNumericValue).length;
  if (numericCount / nonNullValues.length >= 0.8) {
    return 'numeric';
  }

  // Parsing of non-ISO strings by Date.parse is implementation-defined, and
  // V8 accepts labels such as "Group 1". A value therefore has to look like
  // a date (digit groups with separators, or a month name next to a digit)
  // before Date.parse is consulted.
  const isoDatePrefix = /^\d{4}-\d{2}-\d{2}/;
  const numericDateShape = /^\s*\d{1,4}[\/\-.]\d{1,2}[\/\-.]\d{1,4}/;
  const monthName = /\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\b/i;

  const isDateValue = (v: unknown): boolean => {
    if (v instanceof Date) {
      return !isNaN(v.getTime());
    }
    const text = String(v);
    if (isoDatePrefix.test(text)) {
      return true;
    }
    const looksLikeDate =
      numericDateShape.test(text) || (monthName.test(text) && /\d/.test(text));
    return looksLikeDate && !isNaN(Date.parse(text));
  };

  const dateCount = nonNullValues.filter(isDateValue).length;
  if (dateCount / nonNullValues.length >= 0.8) {
    return 'datetime';
  }

  // Few distinct values relative to the row count: treat as categorical.
  const uniqueCount = new Set(nonNullValues).size;
  if (uniqueCount < nonNullValues.length * 0.2 && uniqueCount <= 50) {
    return 'categorical';
  }

  return 'text';
}
|
||||
|
||||
/**
|
||||
* 格式化Session数据
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user