feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end
Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
353
backend/src/modules/ssa/services/DataProfileService.ts
Normal file
353
backend/src/modules/ssa/services/DataProfileService.ts
Normal file
@@ -0,0 +1,353 @@
|
||||
/**
 * SSA DataProfile 服务 (Phase 2A)
 *
 * 调用 Python Tool C 生成数据画像,用于 LLM 生成分析计划
 *
 * 时机:用户上传数据时(时机 A)
 * 输出:DataProfile JSON,存入 SsaSession.dataProfile
 */
|
||||
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { logger } from '../../../common/logging/index.js';
|
||||
import { prisma } from '../../../config/database.js';
|
||||
import { storage } from '../../../common/storage/index.js';
|
||||
|
||||
/**
 * Data profile of one uploaded dataset: a per-column breakdown plus
 * dataset-level totals. This is the object persisted on
 * `SsaSession.dataProfile`.
 */
export interface DataProfile {
  /** One entry per column of the dataset. */
  columns: ColumnProfile[];
  /** Dataset-level counts and missing-value totals. */
  summary: DataSummary;
}
|
||||
|
||||
/**
 * Profile of a single column. The common fields are always present; the
 * remaining groups are optional and are expected to be filled only for the
 * matching column `type`.
 */
export interface ColumnProfile {
  /** Column name. */
  name: string;
  /** Detected column kind. */
  type: 'numeric' | 'categorical' | 'datetime' | 'text';
  missingCount: number;
  /** NOTE(review): rendered with a trailing `%` in the LLM summary, so presumably already a percentage — confirm against the Python service. */
  missingRate: number;
  uniqueCount: number;
  totalCount: number;

  // Numeric columns
  mean?: number;
  std?: number;
  median?: number;
  min?: number;
  max?: number;
  q1?: number;
  q3?: number;
  iqr?: number;
  outlierCount?: number;
  outlierRate?: number;
  skewness?: number;
  kurtosis?: number;

  // Categorical columns
  topValues?: Array<{ value: string; count: number; percentage: number }>;
  totalLevels?: number;
  modeValue?: string;
  modeCount?: number;

  // Date columns
  minDate?: string;
  maxDate?: string;
  dateRange?: string;
}
|
||||
|
||||
/**
 * Dataset-level totals: row/column counts, a per-kind column breakdown and
 * overall missing-value figures.
 */
export interface DataSummary {
  totalRows: number;
  totalColumns: number;
  numericColumns: number;
  categoricalColumns: number;
  datetimeColumns: number;
  textColumns: number;
  /** NOTE(review): rendered with a trailing `%` in the LLM summary, so presumably already a percentage — confirm against the Python service. */
  overallMissingRate: number;
  totalMissingCells: number;
}
|
||||
|
||||
/**
 * Data-quality assessment returned alongside the profile when the request
 * sets `include_quality_score`.
 */
export interface QualityScore {
  /** Numeric quality score (scale defined by the profiling service). */
  score: number;
  /** Letter grade derived from the score. */
  grade: 'A' | 'B' | 'C' | 'D';
  /** Human-readable explanation of the grade. */
  gradeDescription: string;
  /** Detected data-quality problems. */
  issues: string[];
  /** Suggested follow-up actions. */
  recommendations: string[];
}
|
||||
|
||||
/**
 * Outcome of a profile-generation call. The service methods report failure
 * through `success: false` plus `error` instead of throwing.
 */
export interface DataProfileResult {
  /** `true` when a profile was produced (or served from the session cache). */
  success: boolean;
  /** The generated or cached profile; absent on failure. */
  profile?: DataProfile;
  /** Quality assessment from the profiling service, when it returned one. */
  quality?: QualityScore;
  /**
   * On success this is the `execution_time` value reported by the profiling
   * service; on failure it is the locally measured elapsed time in seconds.
   */
  executionTime?: number;
  /** Failure message; set only when `success` is `false`. */
  error?: string;
}
|
||||
|
||||
/**
 * Generates data profiles for SSA sessions by calling the Python profiling
 * service over HTTP, caches the resulting profile on the session row, and
 * renders a compact textual summary for LLM prompts.
 *
 * The public `generate*` methods never throw: every failure is logged and
 * reported as `{ success: false, error }`.
 */
export class DataProfileService {
  /** Request options shared by both profiling endpoints. */
  private static readonly PROFILE_OPTIONS = {
    max_unique_values: 20,
    include_quality_score: true
  };

  private readonly client: AxiosInstance;

  /**
   * Builds the HTTP client. The base URL comes from `EXTRACTION_SERVICE_URL`
   * and falls back to `http://localhost:8000` when unset or empty.
   */
  constructor() {
    const baseURL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';

    this.client = axios.create({
      baseURL,
      timeout: 60000, // axios timeouts are in milliseconds
      headers: {
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Generates a data profile for an SSA session from in-memory rows.
   *
   * @param sessionId SSA session ID
   * @param data Dataset rows (array of JSON objects)
   * @returns The profile result; failures are returned, not thrown
   */
  async generateProfile(sessionId: string, data: Record<string, any>[]): Promise<DataProfileResult> {
    const startTime = Date.now();

    try {
      logger.info('[SSA:DataProfile] Generating profile', {
        sessionId,
        rowCount: data.length,
        columnCount: data.length > 0 ? Object.keys(data[0]).length : 0
      });

      const result = await this.requestAndStoreProfile(sessionId, '/api/ssa/data-profile', {
        data,
        ...DataProfileService.PROFILE_OPTIONS
      });

      logger.info('[SSA:DataProfile] Profile generated successfully', {
        sessionId,
        executionMs: Date.now() - startTime,
        summary: result.profile?.summary
      });

      return result;
    } catch (error: unknown) {
      return this.buildFailure(sessionId, '[SSA:DataProfile] Profile generation failed', error, startTime);
    }
  }

  /**
   * Generates a data profile directly from CSV text; the CSV is parsed by the
   * Python service rather than in Node.
   *
   * @param sessionId SSA session ID
   * @param csvContent Raw CSV file content
   * @returns The profile result; failures are returned, not thrown
   */
  async generateProfileFromCSV(sessionId: string, csvContent: string): Promise<DataProfileResult> {
    const startTime = Date.now();

    try {
      logger.info('[SSA:DataProfile] Generating profile from CSV', {
        sessionId,
        contentLength: csvContent.length
      });

      // Send the CSV text as-is and let the Python service parse it.
      const result = await this.requestAndStoreProfile(sessionId, '/api/ssa/data-profile-csv', {
        csv_content: csvContent,
        ...DataProfileService.PROFILE_OPTIONS
      });

      logger.info('[SSA:DataProfile] Profile generated from CSV successfully', {
        sessionId,
        executionMs: Date.now() - startTime,
        summary: result.profile?.summary
      });

      return result;
    } catch (error: unknown) {
      return this.buildFailure(sessionId, '[SSA:DataProfile] CSV profile generation failed', error, startTime);
    }
  }

  /**
   * Loads the session's data (inline payload or stored file) and generates a
   * profile for it. A profile already stored on the session is returned
   * without calling the profiling service.
   *
   * @param sessionId SSA session ID
   * @returns The profile result; failures are returned, not thrown
   */
  async generateProfileFromSession(sessionId: string): Promise<DataProfileResult> {
    try {
      const session = await prisma.ssaSession.findUnique({
        where: { id: sessionId }
      });

      if (!session) {
        throw new Error(`Session not found: ${sessionId}`);
      }

      // A profile is already cached on the session: return it directly.
      if (session.dataProfile) {
        logger.info('[SSA:DataProfile] Using cached profile', { sessionId });
        return {
          success: true,
          profile: session.dataProfile as unknown as DataProfile
        };
      }

      // Inline JSON payload takes precedence over the stored file.
      if (session.dataPayload) {
        const data = session.dataPayload as unknown as Record<string, any>[];
        return await this.generateProfile(sessionId, data);
      }

      if (session.dataOssKey) {
        const buffer = await storage.download(session.dataOssKey);
        const content = buffer.toString('utf-8');

        // Sniff the format from the first non-whitespace character.
        const trimmedContent = content.trim();
        if (trimmedContent.startsWith('[') || trimmedContent.startsWith('{')) {
          // Parse the trimmed text: trim() strips a leading UTF-8 BOM (U+FEFF),
          // which JSON.parse would otherwise reject.
          const data: Record<string, any>[] = JSON.parse(trimmedContent);
          return await this.generateProfile(sessionId, data);
        }

        // Anything else is treated as CSV and parsed by the Python service.
        return await this.generateProfileFromCSV(sessionId, content);
      }

      throw new Error('No data available for session');
    } catch (error: unknown) {
      const message = DataProfileService.errorMessage(error);
      logger.error('[SSA:DataProfile] Failed to generate profile from session', {
        sessionId,
        error: message
      });

      return {
        success: false,
        error: message
      };
    }
  }

  /**
   * Returns the profile stored on the session, or `null` when the session
   * does not exist or has no profile yet.
   */
  async getCachedProfile(sessionId: string): Promise<DataProfile | null> {
    const session = await prisma.ssaSession.findUnique({
      where: { id: sessionId },
      select: { dataProfile: true }
    });

    // Normalise "session not found" (undefined) to null to honour the return type.
    return (session?.dataProfile as unknown as DataProfile | null | undefined) ?? null;
  }

  /**
   * Renders a compact Markdown summary of a profile for prompt injection,
   * keeping token usage low: dataset totals followed by one line per column.
   *
   * Statistics missing from a column are rendered as `N/A` rather than
   * leaking the literal text `undefined`/`null` into the prompt, and the
   * level list is omitted when a categorical column has no top values.
   *
   * @param profile Profile to summarise
   * @returns Newline-joined summary text
   */
  generateProfileSummaryForLLM(profile: DataProfile): string {
    const { summary, columns } = profile;
    const show = (value: number | null | undefined): string => (value == null ? 'N/A' : String(value));

    const lines: string[] = [
      `## 数据概况`,
      `- 样本量: ${summary.totalRows} 行`,
      `- 变量数: ${summary.totalColumns} 列 (${summary.numericColumns} 数值, ${summary.categoricalColumns} 分类)`,
      `- 整体缺失率: ${summary.overallMissingRate}%`,
      '',
      `## 变量清单`
    ];

    for (const col of columns) {
      let desc = `- **${col.name}** [${col.type}]`;

      if (col.missingRate > 0) {
        desc += ` (缺失 ${col.missingRate}%)`;
      }

      if (col.type === 'numeric') {
        desc += `: 均值=${show(col.mean)}, SD=${show(col.std)}, 范围=[${show(col.min)}, ${show(col.max)}]`;
        if ((col.outlierCount ?? 0) > 0) {
          desc += `, ${col.outlierCount}个异常值`;
        }
      } else if (col.type === 'categorical') {
        // Fall back to the distinct-value count when the service omitted totalLevels.
        const levelCount = col.totalLevels ?? col.uniqueCount;
        const shownLevels = (col.topValues ?? [])
          .slice(0, 5)
          .map(v => v.value)
          .join(', ');
        const ellipsis = levelCount != null && levelCount > 5 ? '...' : '';

        desc += `: ${show(levelCount)}个水平`;
        if (shownLevels) {
          desc += ` (${shownLevels}${ellipsis})`;
        }
      }

      lines.push(desc);
    }

    return lines.join('\n');
  }

  /**
   * Posts a profiling request, validates the service response, persists the
   * profile on the session and returns the success result. Throws when the
   * HTTP call fails or the service reports `success: false`.
   */
  private async requestAndStoreProfile(
    sessionId: string,
    endpoint: string,
    payload: Record<string, unknown>
  ): Promise<DataProfileResult> {
    const response = await this.client.post(endpoint, payload);

    if (!response.data.success) {
      throw new Error(response.data.error || 'Profile generation failed');
    }

    const result: DataProfileResult = {
      success: true,
      profile: response.data.profile,
      quality: response.data.quality,
      executionTime: response.data.execution_time
    };

    // Persist to the database (best-effort; see saveProfileToSession).
    await this.saveProfileToSession(sessionId, result);

    return result;
  }

  /**
   * Logs a failed generation attempt and converts it into a failure result
   * whose `executionTime` is the elapsed wall-clock time in seconds.
   */
  private buildFailure(
    sessionId: string,
    logMessage: string,
    error: unknown,
    startTime: number
  ): DataProfileResult {
    const executionMs = Date.now() - startTime;
    const message = DataProfileService.errorMessage(error);

    logger.error(logMessage, {
      sessionId,
      error: message,
      executionMs
    });

    return {
      success: false,
      error: message,
      executionTime: executionMs / 1000
    };
  }

  /**
   * Stores the profile on the session row. Best-effort by design: a failed
   * write is logged and swallowed so the freshly generated profile is still
   * returned to the caller.
   */
  private async saveProfileToSession(sessionId: string, result: DataProfileResult): Promise<void> {
    try {
      await prisma.ssaSession.update({
        where: { id: sessionId },
        data: {
          // Cast kept from the original code: the column type is declared outside this file.
          dataProfile: result.profile as any
        }
      });

      logger.info('[SSA:DataProfile] Profile saved to session', { sessionId });
    } catch (error: unknown) {
      logger.error('[SSA:DataProfile] Failed to save profile', {
        sessionId,
        error: DataProfileService.errorMessage(error)
      });
    }
  }

  /** Extracts a printable message from an arbitrary thrown value. */
  private static errorMessage(error: unknown): string {
    if (error instanceof Error) {
      return error.message;
    }
    if (typeof error === 'object' && error !== null && 'message' in error) {
      const candidate = (error as { message: unknown }).message;
      if (typeof candidate === 'string') {
        return candidate;
      }
    }
    return String(error);
  }
}
|
||||
|
||||
// 单例导出
|
||||
// Shared module-level instance. Constructing it runs the DataProfileService
// constructor, so the HTTP client is created as soon as this module is imported.
export const dataProfileService = new DataProfileService();
|
||||
Reference in New Issue
Block a user