feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end

Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-20 23:09:27 +08:00
parent 23b422f758
commit 428a22adf2
62 changed files with 15416 additions and 299 deletions

View File

@@ -0,0 +1,353 @@
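/**
 * SSA DataProfile service (Phase 2A)
 *
 * Calls Python Tool C to generate a data profile, which is used by the LLM to produce an analysis plan.
 *
 * Timing: when the user uploads data (timing A).
 * Output: DataProfile JSON, stored in SsaSession.dataProfile.
 */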
import axios, { AxiosInstance } from 'axios';
import { logger } from '../../../common/logging/index.js';
import { prisma } from '../../../config/database.js';
import { storage } from '../../../common/storage/index.js';
/**
 * Data profile of a dataset: one profile per column plus dataset-level totals.
 */
export interface DataProfile {
// Per-column profiles.
columns: ColumnProfile[];
// Dataset-level counts and missing-data totals.
summary: DataSummary;
}
/**
 * Profile of a single column.
 *
 * The first six fields are always present; the remaining fields are optional
 * and grouped by the column type they describe.
 * NOTE(review): presumably each optional group is only populated when `type`
 * matches that group — confirm against the Python profiling service.
 */
export interface ColumnProfile {
name: string;
type: 'numeric' | 'categorical' | 'datetime' | 'text';
missingCount: number;
// Rendered with a "%" suffix in the LLM summary, so presumably a percentage (0-100) — TODO confirm.
missingRate: number;
uniqueCount: number;
totalCount: number;
// Numeric columns
mean?: number;
std?: number;
median?: number;
min?: number;
max?: number;
q1?: number;
q3?: number;
iqr?: number;
outlierCount?: number;
outlierRate?: number;
skewness?: number;
kurtosis?: number;
// Categorical columns
topValues?: Array<{ value: string; count: number; percentage: number }>;
totalLevels?: number;
modeValue?: string;
modeCount?: number;
// Datetime columns
minDate?: string;
maxDate?: string;
dateRange?: string;
}
/**
 * Dataset-level totals: row/column counts, a per-type column breakdown and
 * overall missing-data figures.
 */
export interface DataSummary {
totalRows: number;
totalColumns: number;
numericColumns: number;
categoricalColumns: number;
datetimeColumns: number;
textColumns: number;
// Rendered with a "%" suffix in the LLM summary, so presumably a percentage (0-100) — TODO confirm.
overallMissingRate: number;
totalMissingCells: number;
}
/**
 * Data-quality assessment returned alongside a profile: a numeric score, a
 * letter grade with its description, and lists of issues and recommendations.
 * NOTE(review): the score range and grade thresholds are defined by the
 * profiling service and are not visible in this file.
 */
export interface QualityScore {
score: number;
grade: 'A' | 'B' | 'C' | 'D';
gradeDescription: string;
issues: string[];
recommendations: string[];
}
/**
 * Outcome of a profile-generation call.
 *
 * DataProfileService reports failures through this object (`success: false`
 * plus `error`) instead of throwing.
 */
export interface DataProfileResult {
success: boolean;
// Present on success.
profile?: DataProfile;
// Copied from the service response; absent when a cached profile is returned.
quality?: QualityScore;
// On success: the service's `execution_time` value. On failure: elapsed wall time in seconds (ms / 1000).
executionTime?: number;
// Error message; set when `success` is false.
error?: string;
}
/**
 * Client for the SSA data-profile endpoints of the Python extraction service.
 *
 * Generates a DataProfile for a session's dataset (from in-memory rows, raw
 * CSV text, or the data referenced by the session record), stores it on the
 * session, and renders a compact text summary for LLM prompts.
 *
 * The generate* methods never throw: failures are logged and reported through
 * the returned DataProfileResult (`success: false`, `error`).
 */
export class DataProfileService {
  private readonly client: AxiosInstance;

  constructor() {
    // `||` means an empty EXTRACTION_SERVICE_URL also falls back to the default.
    const baseURL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';
    this.client = axios.create({
      baseURL,
      timeout: 60000,
      headers: {
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Generate a data profile for an SSA session from in-memory rows.
   *
   * @param sessionId SSA session ID
   * @param data Data rows (JSON format)
   * @returns The profile result; `success: false` with `error` on failure
   */
  async generateProfile(sessionId: string, data: Record<string, any>[]): Promise<DataProfileResult> {
    const startTime = Date.now();
    try {
      logger.info('[SSA:DataProfile] Generating profile', {
        sessionId,
        rowCount: data.length,
        columnCount: data.length > 0 ? Object.keys(data[0]).length : 0
      });
      return await this.requestAndStoreProfile({
        sessionId,
        endpoint: '/api/ssa/data-profile',
        payload: { data },
        successMessage: '[SSA:DataProfile] Profile generated successfully',
        startTime
      });
    } catch (error: unknown) {
      return this.buildFailureResult(sessionId, '[SSA:DataProfile] Profile generation failed', error, startTime);
    }
  }

  /**
   * Generate a data profile directly from CSV text (the Python service parses
   * the CSV).
   *
   * @param sessionId SSA session ID
   * @param csvContent CSV file content
   * @returns The profile result; `success: false` with `error` on failure
   */
  async generateProfileFromCSV(sessionId: string, csvContent: string): Promise<DataProfileResult> {
    const startTime = Date.now();
    try {
      logger.info('[SSA:DataProfile] Generating profile from CSV', {
        sessionId,
        contentLength: csvContent.length
      });
      // Send the raw CSV text; parsing happens on the Python side.
      return await this.requestAndStoreProfile({
        sessionId,
        endpoint: '/api/ssa/data-profile-csv',
        payload: { csv_content: csvContent },
        successMessage: '[SSA:DataProfile] Profile generated from CSV successfully',
        startTime
      });
    } catch (error: unknown) {
      return this.buildFailureResult(sessionId, '[SSA:DataProfile] CSV profile generation failed', error, startTime);
    }
  }

  /**
   * Load the session's data and generate its profile.
   *
   * Resolution order: a profile already stored on the session is returned
   * as-is; otherwise `dataPayload` is profiled; otherwise the file at
   * `dataOssKey` is downloaded and profiled as JSON or CSV.
   *
   * @param sessionId SSA session ID
   * @returns The profile result; `success: false` with `error` on failure
   */
  async generateProfileFromSession(sessionId: string): Promise<DataProfileResult> {
    try {
      const session = await prisma.ssaSession.findUnique({
        where: { id: sessionId }
      });
      if (!session) {
        throw new Error(`Session not found: ${sessionId}`);
      }

      // A profile is already stored: return it without calling the service.
      if (session.dataProfile) {
        logger.info('[SSA:DataProfile] Using cached profile', { sessionId });
        return {
          success: true,
          profile: session.dataProfile as unknown as DataProfile
        };
      }

      if (session.dataPayload) {
        // Inline JSON rows.
        const data = session.dataPayload as unknown as Record<string, any>[];
        return await this.generateProfile(sessionId, data);
      }

      if (session.dataOssKey) {
        const buffer = await storage.download(session.dataOssKey);
        // Strip a leading UTF-8 BOM: JSON.parse rejects it, and it would
        // otherwise be sent to the CSV parser as part of the first header.
        const content = buffer.toString('utf-8').replace(/^\uFEFF/, '');

        // Format sniffing: JSON if the first non-blank character is '[' or '{', else CSV.
        const trimmedContent = content.trim();
        if (trimmedContent.startsWith('[') || trimmedContent.startsWith('{')) {
          // NOTE(review): a top-level JSON object (not an array) is forwarded
          // unchanged — confirm the Python endpoint accepts that shape.
          const data = JSON.parse(content);
          return await this.generateProfile(sessionId, data);
        }
        return await this.generateProfileFromCSV(sessionId, content);
      }

      throw new Error('No data available for session');
    } catch (error: unknown) {
      const message = DataProfileService.errorMessage(error);
      logger.error('[SSA:DataProfile] Failed to generate profile from session', {
        sessionId,
        error: message
      });
      return {
        success: false,
        error: message
      };
    }
  }

  /**
   * Return the profile stored on the session, or `null` when the session does
   * not exist or has no profile yet.
   */
  async getCachedProfile(sessionId: string): Promise<DataProfile | null> {
    const session = await prisma.ssaSession.findUnique({
      where: { id: sessionId },
      select: { dataProfile: true }
    });
    // `?? null` normalises the "session not found" case (undefined) to null.
    return (session?.dataProfile ?? null) as unknown as DataProfile | null;
  }

  /**
   * Render a compact Markdown summary of a profile for prompt injection,
   * keeping token usage low.
   *
   * Optional statistics that are missing are rendered as `N/A` (numeric
   * columns) or omitted (categorical columns) rather than leaking the text
   * "undefined" / "null" into the prompt.
   *
   * @param profile Profile to summarise
   * @returns Newline-joined Markdown text
   */
  generateProfileSummaryForLLM(profile: DataProfile): string {
    const { summary, columns } = profile;
    const show = (value: number | null | undefined): string =>
      value == null ? 'N/A' : String(value);

    const lines: string[] = [
      `## 数据概况`,
      `- 样本量: ${summary.totalRows}`,
      `- 变量数: ${summary.totalColumns} 列 (${summary.numericColumns} 数值, ${summary.categoricalColumns} 分类)`,
      `- 整体缺失率: ${summary.overallMissingRate}%`,
      '',
      `## 变量清单`
    ];

    for (const col of columns) {
      let desc = `- **${col.name}** [${col.type}]`;
      if (col.missingRate > 0) {
        desc += ` (缺失 ${col.missingRate}%)`;
      }

      if (col.type === 'numeric') {
        desc += `: 均值=${show(col.mean)}, SD=${show(col.std)}, 范围=[${show(col.min)}, ${show(col.max)}]`;
        if (col.outlierCount && col.outlierCount > 0) {
          desc += `, ${col.outlierCount}个异常值`;
        }
      } else if (col.type === 'categorical') {
        // Show at most five level names; the ellipsis marks that more exist.
        const shownLevels = (col.topValues ?? []).slice(0, 5).map(v => v.value);
        const hasMore = col.totalLevels != null && col.totalLevels > 5;
        const parts: string[] = [];
        if (col.totalLevels != null) {
          parts.push(`${col.totalLevels}个水平`);
        }
        if (shownLevels.length > 0) {
          parts.push(`(${shownLevels.join(', ')}${hasMore ? '...' : ''})`);
        }
        if (parts.length > 0) {
          desc += `: ${parts.join(' ')}`;
        }
      }

      lines.push(desc);
    }

    return lines.join('\n');
  }

  /**
   * POST a profiling request, store the returned profile on the session and
   * log success. Throws on transport errors or an unsuccessful response; the
   * public callers convert that into a failure result.
   */
  private async requestAndStoreProfile(request: {
    sessionId: string;
    endpoint: string;
    payload: Record<string, unknown>;
    successMessage: string;
    startTime: number;
  }): Promise<DataProfileResult> {
    const { sessionId, endpoint, payload, successMessage, startTime } = request;

    const response = await this.client.post(endpoint, {
      ...payload,
      max_unique_values: 20,
      include_quality_score: true
    });
    if (!response.data.success) {
      throw new Error(response.data.error || 'Profile generation failed');
    }

    const result: DataProfileResult = {
      success: true,
      profile: response.data.profile,
      quality: response.data.quality,
      executionTime: response.data.execution_time
    };

    // Store on the session record.
    await this.saveProfileToSession(sessionId, result);

    logger.info(successMessage, {
      sessionId,
      executionMs: Date.now() - startTime,
      summary: result.profile?.summary
    });
    return result;
  }

  /** Log a failed profiling call and build the matching failure result. */
  private buildFailureResult(
    sessionId: string,
    logMessage: string,
    error: unknown,
    startTime: number
  ): DataProfileResult {
    const executionMs = Date.now() - startTime;
    const message = DataProfileService.errorMessage(error);
    logger.error(logMessage, {
      sessionId,
      error: message,
      executionMs
    });
    return {
      success: false,
      error: message,
      executionTime: executionMs / 1000
    };
  }

  /**
   * Save the profile onto the session record.
   *
   * Failures are logged and not rethrown, so a storage error does not discard
   * a profile that was generated successfully.
   */
  private async saveProfileToSession(sessionId: string, result: DataProfileResult): Promise<void> {
    try {
      await prisma.ssaSession.update({
        where: { id: sessionId },
        data: {
          dataProfile: result.profile as any
        }
      });
      logger.info('[SSA:DataProfile] Profile saved to session', { sessionId });
    } catch (error: unknown) {
      logger.error('[SSA:DataProfile] Failed to save profile', {
        sessionId,
        error: DataProfileService.errorMessage(error)
      });
    }
  }

  /** Extract a printable message from a caught value of unknown type. */
  private static errorMessage(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}
// Singleton export: a shared module-level instance, constructed when this module is loaded.
export const dataProfileService = new DataProfileService();