feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end

Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-20 23:09:27 +08:00
parent 23b422f758
commit 428a22adf2
62 changed files with 15416 additions and 299 deletions

View File

@@ -0,0 +1,100 @@
-- =====================================================
-- Phase 2A: SSA intelligence core - database migration script
-- Date: 2026-02-20
-- Description: adds the workflow tables and the data-profile column
-- Note: ssa_sessions.id is of type TEXT (it stores the UUID as a string)
-- =====================================================
-- 1. Add the data_profile column to ssa_sessions (no-op when the column already exists)
ALTER TABLE ssa_schema.ssa_sessions
ADD COLUMN IF NOT EXISTS data_profile JSONB;
COMMENT ON COLUMN ssa_schema.ssa_sessions.data_profile IS 'Python Tool C 生成的数据画像 (Phase 2A)';
-- 2. Create the ssa_workflows table (one row per multi-step analysis workflow)
CREATE TABLE IF NOT EXISTS ssa_schema.ssa_workflows (
-- The primary key is a generated UUID cast to TEXT, the same type as the session_id foreign key below.
id TEXT PRIMARY KEY DEFAULT gen_random_uuid()::TEXT,
session_id TEXT NOT NULL,
message_id TEXT,
-- NOTE(review): the allowed status values are only listed in the column comment further down;
-- no CHECK constraint enforces them.
status VARCHAR(20) NOT NULL DEFAULT 'pending',
total_steps INTEGER NOT NULL,
completed_steps INTEGER NOT NULL DEFAULT 0,
workflow_plan JSONB NOT NULL,
reasoning TEXT,
created_at TIMESTAMP WITHOUT TIME ZONE NOT NULL DEFAULT NOW(),
started_at TIMESTAMP WITHOUT TIME ZONE,
completed_at TIMESTAMP WITHOUT TIME ZONE,
-- Deleting a session also deletes its workflows.
CONSTRAINT fk_ssa_workflow_session
FOREIGN KEY (session_id)
REFERENCES ssa_schema.ssa_sessions(id)
ON DELETE CASCADE
);
-- Indexes on ssa_workflows
CREATE INDEX IF NOT EXISTS idx_ssa_workflow_session
ON ssa_schema.ssa_workflows(session_id);
CREATE INDEX IF NOT EXISTS idx_ssa_workflow_status
ON ssa_schema.ssa_workflows(status);
-- Table and column comments for ssa_workflows
COMMENT ON TABLE ssa_schema.ssa_workflows IS 'SSA 多步骤分析工作流 (Phase 2A)';
COMMENT ON COLUMN ssa_schema.ssa_workflows.status IS 'pending | running | completed | partial | error';
COMMENT ON COLUMN ssa_schema.ssa_workflows.workflow_plan IS 'LLM 生成的原始工作流计划 JSON';
COMMENT ON COLUMN ssa_schema.ssa_workflows.reasoning IS 'LLM 规划理由说明';
-- 3. Create the ssa_workflow_steps table (one row per step of a workflow)
CREATE TABLE IF NOT EXISTS ssa_schema.ssa_workflow_steps (
id TEXT PRIMARY KEY DEFAULT gen_random_uuid()::TEXT,
workflow_id TEXT NOT NULL,
-- NOTE(review): no UNIQUE constraint on (workflow_id, step_order) is declared here.
step_order INTEGER NOT NULL,
tool_code VARCHAR(50) NOT NULL,
tool_name VARCHAR(100) NOT NULL,
status VARCHAR(20) NOT NULL DEFAULT 'pending',
input_params JSONB,
guardrail_checks JSONB,
output_result JSONB,
error_info JSONB,
execution_ms INTEGER,
started_at TIMESTAMP WITHOUT TIME ZONE,
completed_at TIMESTAMP WITHOUT TIME ZONE,
-- Deleting a workflow also deletes its steps.
CONSTRAINT fk_ssa_workflow_step_workflow
FOREIGN KEY (workflow_id)
REFERENCES ssa_schema.ssa_workflows(id)
ON DELETE CASCADE
);
-- Indexes on ssa_workflow_steps
CREATE INDEX IF NOT EXISTS idx_ssa_workflow_step_workflow
ON ssa_schema.ssa_workflow_steps(workflow_id);
CREATE INDEX IF NOT EXISTS idx_ssa_workflow_step_status
ON ssa_schema.ssa_workflow_steps(status);
-- Table and column comments for ssa_workflow_steps
COMMENT ON TABLE ssa_schema.ssa_workflow_steps IS 'SSA 工作流单步执行记录 (Phase 2A)';
COMMENT ON COLUMN ssa_schema.ssa_workflow_steps.status IS 'pending | running | success | warning | error | skipped';
COMMENT ON COLUMN ssa_schema.ssa_workflow_steps.guardrail_checks IS 'R Service JIT 护栏检验结果 (正态性、方差齐性等)';
COMMENT ON COLUMN ssa_schema.ssa_workflow_steps.output_result IS '工具执行结果 (已裁剪,符合 LLM 上下文限制)';
COMMENT ON COLUMN ssa_schema.ssa_workflow_steps.error_info IS '错误信息 (用于容错管道的部分成功场景)';
-- =====================================================
-- Verification queries
-- Each SELECT returns one row (item, status) reporting whether the object
-- created above is now visible in information_schema.
-- =====================================================
-- Check: column ssa_sessions.data_profile exists
SELECT 'ssa_sessions.data_profile 字段' as item,
CASE WHEN EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_schema = 'ssa_schema' AND table_name = 'ssa_sessions' AND column_name = 'data_profile'
) THEN '✅ 已创建' ELSE '❌ 未创建' END as status;
-- Check: table ssa_workflows exists
SELECT 'ssa_workflows 表' as item,
CASE WHEN EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_schema = 'ssa_schema' AND table_name = 'ssa_workflows'
) THEN '✅ 已创建' ELSE '❌ 未创建' END as status;
-- Check: table ssa_workflow_steps exists
SELECT 'ssa_workflow_steps 表' as item,
CASE WHEN EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_schema = 'ssa_schema' AND table_name = 'ssa_workflow_steps'
) THEN '✅ 已创建' ELSE '❌ 未创建' END as status;

View File

@@ -2153,12 +2153,14 @@ model SsaSession {
dataSchema Json? @map("data_schema") /// 数据结构LLM可见
dataPayload Json? @map("data_payload") /// 真实数据仅R可见
dataOssKey String? @map("data_oss_key") /// OSS 存储 key大数据
dataProfile Json? @map("data_profile") /// 🆕 Python 生成的 DataProfilePhase 2A
status String @default("active") /// active | consult | completed | error
createdAt DateTime @default(now()) @map("created_at")
updatedAt DateTime @updatedAt @map("updated_at")
messages SsaMessage[]
executionLogs SsaExecutionLog[]
workflows SsaWorkflow[] /// 🆕 多步骤流程Phase 2A
@@index([userId], map: "idx_ssa_session_user")
@@index([status], map: "idx_ssa_session_status")
@@ -2306,3 +2308,54 @@ model SsaInterpretation {
@@map("ssa_interpretation_templates")
@@schema("ssa_schema")
}
// ============================================================
// 🆕 Phase 2A 新增:多步骤流程管理
// ============================================================
/// SSA multi-step workflow: one record per planned analysis run of a session.
model SsaWorkflow {
id String @id @default(uuid())
sessionId String @map("session_id")
messageId String? @map("message_id") /// The plan message this workflow is associated with
// NOTE(review): status is a plain String; the values listed in the comment are not enforced by an enum.
status String @default("pending") /// pending | running | completed | partial | error
totalSteps Int @map("total_steps")
completedSteps Int @default(0) @map("completed_steps")
workflowPlan Json @map("workflow_plan") /// The original plan JSON
reasoning String? @db.Text /// The LLM's reasoning for the plan
createdAt DateTime @default(now()) @map("created_at")
startedAt DateTime? @map("started_at")
completedAt DateTime? @map("completed_at")
// Deleting the session cascades to its workflows.
session SsaSession @relation(fields: [sessionId], references: [id], onDelete: Cascade)
steps SsaWorkflowStep[]
@@index([sessionId], map: "idx_ssa_workflow_session")
@@index([status], map: "idx_ssa_workflow_status")
@@map("ssa_workflows")
@@schema("ssa_schema")
}
/// SSA workflow step: the execution record of a single step within a workflow.
model SsaWorkflowStep {
id String @id @default(uuid())
workflowId String @map("workflow_id")
stepOrder Int @map("step_order") /// Position of the step in the workflow (1, 2, 3, ...)
toolCode String @map("tool_code")
toolName String @map("tool_name")
// NOTE(review): status is a plain String; the values listed in the comment are not enforced by an enum.
status String @default("pending") /// pending | running | success | warning | error | skipped
inputParams Json? @map("input_params") /// Input parameters
guardrailChecks Json? @map("guardrail_checks") /// Results of the JIT guardrail checks
outputResult Json? @map("output_result") /// Execution result
errorInfo Json? @map("error_info") /// Error information
executionMs Int? @map("execution_ms") /// Execution time in milliseconds
startedAt DateTime? @map("started_at")
completedAt DateTime? @map("completed_at")
// Deleting the workflow cascades to its steps.
workflow SsaWorkflow @relation(fields: [workflowId], references: [id], onDelete: Cascade)
@@index([workflowId], map: "idx_ssa_workflow_step_workflow")
@@index([status], map: "idx_ssa_workflow_step_status")
@@map("ssa_workflow_steps")
@@schema("ssa_schema")
}

View File

@@ -13,6 +13,7 @@ import sessionRoutes from './routes/session.routes.js';
import analysisRoutes from './routes/analysis.routes.js';
import consultRoutes from './routes/consult.routes.js';
import configRoutes from './routes/config.routes.js';
import workflowRoutes from './routes/workflow.routes.js';
export async function ssaRoutes(app: FastifyInstance) {
// 注册认证中间件(遵循模块认证规范)
@@ -23,6 +24,8 @@ export async function ssaRoutes(app: FastifyInstance) {
app.register(analysisRoutes, { prefix: '/sessions' });
app.register(consultRoutes, { prefix: '/consult' });
app.register(configRoutes, { prefix: '/config' });
// Phase 2A: 多步骤工作流
app.register(workflowRoutes, { prefix: '/workflow' });
}
export default ssaRoutes;

View File

@@ -0,0 +1,430 @@
/**
* SSA Workflow Routes (Phase 2A)
*
* 多步骤工作流 API
* - POST /plan - 生成工作流计划
* - POST /:workflowId/execute - 执行工作流
* - GET /:workflowId/status - 获取执行状态
* - GET /:workflowId/stream - SSE 实时进度
*/
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
import { logger } from '../../../common/logging/index.js';
import { workflowPlannerService } from '../services/WorkflowPlannerService.js';
import { workflowExecutorService } from '../services/WorkflowExecutorService.js';
import { dataProfileService } from '../services/DataProfileService.js';
// Request type definitions
/** JSON body of POST /plan. */
interface PlanWorkflowBody {
sessionId: string;
userQuery: string;
}
/** Path parameters of POST /:workflowId/execute. */
interface ExecuteWorkflowParams {
workflowId: string;
}
/** Path parameters of GET /:workflowId/status and GET /:workflowId/stream. */
interface WorkflowStatusParams {
workflowId: string;
}
/** JSON body of POST /profile. */
interface GenerateProfileBody {
sessionId: string;
}
/**
 * Registers the SSA multi-step workflow endpoints (Phase 2A).
 *
 * Routes, relative to the prefix the plugin is registered under:
 * - POST /plan                 generate a workflow plan for a session
 * - POST /:workflowId/execute  execute a workflow and return its result
 * - GET  /:workflowId/status   return the current workflow status
 * - GET  /:workflowId/stream   SSE progress stream; connecting starts execution
 * - POST /profile              generate a data profile (with a schema-based fallback)
 *
 * @param app Fastify instance the routes are registered on.
 */
export default async function workflowRoutes(app: FastifyInstance) {
  /**
   * POST /workflow/plan
   * Generates a multi-step workflow plan.
   */
  app.post<{ Body: PlanWorkflowBody }>(
    '/plan',
    async (request, reply) => {
      // A request without a body must yield the 400 below, not a destructuring TypeError.
      const { sessionId, userQuery } = request.body ?? ({} as Partial<PlanWorkflowBody>);
      if (!sessionId || !userQuery) {
        return reply.status(400).send({
          success: false,
          error: 'sessionId and userQuery are required'
        });
      }
      try {
        logger.info('[SSA:API] Planning workflow', { sessionId, userQuery });
        const plan = await workflowPlannerService.planWorkflow(sessionId, userQuery);
        return reply.send({
          success: true,
          plan
        });
      } catch (error: any) {
        logger.error('[SSA:API] Workflow planning failed', {
          sessionId,
          error: error.message
        });
        return reply.status(500).send({
          success: false,
          error: error.message
        });
      }
    }
  );

  /**
   * POST /workflow/:workflowId/execute
   * Executes a workflow and responds once execution has finished.
   */
  app.post<{ Params: ExecuteWorkflowParams; Body: { sessionId: string } }>(
    '/:workflowId/execute',
    async (request, reply) => {
      const { workflowId } = request.params;
      const { sessionId } = request.body ?? ({} as { sessionId?: string });
      if (!sessionId) {
        return reply.status(400).send({
          success: false,
          error: 'sessionId is required'
        });
      }
      try {
        logger.info('[SSA:API] Executing workflow', { workflowId, sessionId });
        const result = await workflowExecutorService.executeWorkflow(workflowId, sessionId);
        return reply.send({
          success: true,
          result
        });
      } catch (error: any) {
        logger.error('[SSA:API] Workflow execution failed', {
          workflowId,
          error: error.message
        });
        return reply.status(500).send({
          success: false,
          error: error.message
        });
      }
    }
  );

  /**
   * GET /workflow/:workflowId/status
   * Returns the workflow status, or 404 when the executor reports none.
   */
  app.get<{ Params: WorkflowStatusParams }>(
    '/:workflowId/status',
    async (request, reply) => {
      const { workflowId } = request.params;
      try {
        const status = await workflowExecutorService.getWorkflowStatus(workflowId);
        if (!status) {
          return reply.status(404).send({
            success: false,
            error: 'Workflow not found'
          });
        }
        return reply.send({
          success: true,
          workflow: status
        });
      } catch (error: any) {
        logger.error('[SSA:API] Get workflow status failed', {
          workflowId,
          error: error.message
        });
        return reply.status(500).send({
          success: false,
          error: error.message
        });
      }
    }
  );

  /**
   * GET /workflow/:workflowId/stream
   * SSE progress stream. Execution starts automatically once the client is connected.
   */
  app.get<{ Params: WorkflowStatusParams }>(
    '/:workflowId/stream',
    async (request, reply) => {
      const { workflowId } = request.params;
      logger.info('[SSA:SSE] Stream connected', { workflowId });

      // SSE response headers.
      // NOTE(review): the wildcard CORS origin is kept from the original; confirm it is intended.
      reply.raw.writeHead(200, {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Access-Control-Allow-Origin': '*'
      });

      // Set once a terminal event has been sent, so cleanup does not add a 'disconnected' event.
      let isCompleted = false;
      // Set once cleanup has run; cleanup can be triggered from several places.
      let isClosed = false;

      const canWrite = () => !reply.raw.writableEnded && !reply.raw.destroyed;

      // Writes one SSE data frame, silently skipping it when the stream is already gone.
      const sendEvent = (payload: unknown) => {
        if (canWrite()) {
          reply.raw.write(`data: ${JSON.stringify(payload)}\n\n`);
        }
      };

      // Initial connection acknowledgement.
      sendEvent({ type: 'connected', workflowId });

      // Heartbeat (SSE comment line) every 15 seconds.
      const heartbeat = setInterval(() => {
        if (canWrite()) {
          reply.raw.write(':heartbeat\n\n');
        }
      }, 15000);

      // Forwards executor progress events to this client.
      const onProgress = (message: any) => {
        // The executor's emitter is shared by every open stream. When an event names a
        // different workflow, ignore it so that another run can neither leak into nor
        // close this stream. Events carrying no workflowId are forwarded as before.
        if (message?.workflowId && message.workflowId !== workflowId) {
          return;
        }
        sendEvent({ ...message, workflowId });
        if (message?.type === 'workflow_complete' || message?.type === 'workflow_error') {
          isCompleted = true;
          cleanup();
        }
      };

      // Idempotent teardown: it is reached from the progress handler, the error paths
      // below and the socket 'close' event, and must never write to or end an ended stream.
      const cleanup = () => {
        if (isClosed) {
          return;
        }
        isClosed = true;
        clearInterval(heartbeat);
        workflowExecutorService.off('progress', onProgress);
        request.raw.off('close', cleanup);
        if (!isCompleted) {
          sendEvent({ type: 'disconnected' });
        }
        if (!reply.raw.writableEnded) {
          reply.raw.end();
        }
      };

      workflowExecutorService.on('progress', onProgress);
      // Tear down when the client disconnects.
      request.raw.on('close', cleanup);

      // Look up the workflow's session and start execution.
      try {
        const { prisma } = await import('../../../config/database.js');
        const workflow = await prisma.ssaWorkflow.findUnique({
          where: { id: workflowId },
          select: { sessionId: true, status: true }
        });

        if (!workflow) {
          sendEvent({
            type: 'workflow_error',
            error: 'Workflow not found',
            workflowId
          });
          isCompleted = true;
          cleanup();
          return;
        }

        // Already finished: report the stored status instead of executing again.
        // The schema documents 'completed | partial | error' as the finished states;
        // 'failed' is kept because the original check used it.
        // NOTE(review): a workflow in status 'running' is still (re)started here, as before —
        // confirm the executor guards against concurrent runs of the same workflow.
        const finishedStatuses = ['completed', 'partial', 'error', 'failed'];
        if (finishedStatuses.includes(workflow.status)) {
          sendEvent({
            type: 'workflow_complete',
            status: workflow.status,
            workflowId
          });
          isCompleted = true;
          cleanup();
          return;
        }

        // Start execution without awaiting it so the SSE connection is not blocked.
        logger.info('[SSA:SSE] Starting workflow execution', { workflowId, sessionId: workflow.sessionId });
        void workflowExecutorService.executeWorkflow(workflowId, workflow.sessionId)
          .catch((error: any) => {
            logger.error('[SSA:SSE] Workflow execution failed', { workflowId, error: error.message });
            sendEvent({
              type: 'workflow_error',
              error: error.message,
              workflowId
            });
            isCompleted = true;
            cleanup();
          });
      } catch (error: any) {
        logger.error('[SSA:SSE] Failed to start workflow', { workflowId, error: error.message });
        sendEvent({
          type: 'workflow_error',
          error: error.message,
          workflowId
        });
        isCompleted = true;
        cleanup();
      }
    }
  );

  /**
   * POST /workflow/profile
   * Generates the data profile of a session.
   */
  app.post<{ Body: GenerateProfileBody }>(
    '/profile',
    async (request, reply) => {
      const { sessionId } = request.body ?? ({} as Partial<GenerateProfileBody>);
      if (!sessionId) {
        return reply.status(400).send({
          success: false,
          error: 'sessionId is required'
        });
      }
      try {
        logger.info('[SSA:API] Generating data profile', { sessionId });
        const result = await dataProfileService.generateProfileFromSession(sessionId);

        if (!result.success || !result.profile) {
          // Profile generation failed: fall back to a simplified profile built from the session schema.
          const { prisma } = await import('../../../config/database.js');
          const session = await prisma.ssaSession.findUnique({
            where: { id: sessionId }
          });
          if (session?.dataSchema) {
            const schema = session.dataSchema as any;
            const fallbackProfile = generateFallbackProfile(schema, session.title || 'data.csv');
            return reply.send({
              success: true,
              profile: fallbackProfile
            });
          }
          // NOTE(review): this failure is reported with HTTP 200 and success=false, as in the original.
          return reply.send({
            success: false,
            error: result.error || 'Profile generation failed'
          });
        }

        // Convert to the format the frontend expects.
        const frontendProfile = convertToFrontendFormat(result.profile, result.quality);
        return reply.send({
          success: true,
          profile: frontendProfile
        });
      } catch (error: any) {
        logger.error('[SSA:API] Profile generation failed', {
          sessionId,
          error: error.message
        });
        return reply.status(500).send({
          success: false,
          error: error.message
        });
      }
    }
  );
}
/**
 * Converts a backend DataProfile into the snake_case structure the frontend expects.
 *
 * Missing numeric fields are treated as 0 so that no ratio becomes NaN or Infinity,
 * and a genuine quality score of 0 is preserved instead of being replaced by the default.
 *
 * @param profile Backend profile; `summary` and `columns` are read and may be absent.
 * @param quality Optional quality assessment (`score`, `grade`, `issues`, `recommendations`).
 * @returns The frontend profile object; `generated_at` is the conversion time (ISO 8601).
 */
function convertToFrontendFormat(profile: any, quality?: any) {
  const summary = profile.summary || {};
  const columns = profile.columns || [];
  const totalRows = summary.totalRows || 0;
  const totalColumns = summary.totalColumns || 0;

  return {
    // The backend profile carries no file name, so a fixed placeholder is reported.
    file_name: 'data.csv',
    row_count: totalRows,
    column_count: totalColumns,
    total_cells: totalRows * totalColumns,
    missing_cells: summary.totalMissingCells || 0,
    // The backend rate is divided by 100 to obtain a 0..1 ratio.
    missing_ratio: (summary.overallMissingRate || 0) / 100,
    duplicate_rows: 0,
    duplicate_ratio: 0,
    numeric_columns: summary.numericColumns || 0,
    categorical_columns: summary.categoricalColumns || 0,
    datetime_columns: summary.datetimeColumns || 0,
    // `??` rather than `||`: a score of 0 is a valid result and must not become 85.
    quality_score: quality?.score ?? 85,
    quality_grade: quality?.grade || 'B',
    columns: columns.map((col: any) => {
      const totalCount = col.totalCount || 0;
      const missingCount = col.missingCount || 0;
      const uniqueCount = col.uniqueCount || 0;
      return {
        name: col.name,
        dtype: col.type,
        inferred_type: col.type,
        non_null_count: totalCount - missingCount,
        null_count: missingCount,
        null_ratio: (col.missingRate || 0) / 100,
        unique_count: uniqueCount,
        // Guard the division: an empty column must yield 0, not Infinity or NaN.
        unique_ratio: totalCount > 0 ? uniqueCount / totalCount : 0,
        sample_values: col.topValues?.slice(0, 5).map((v: any) => v.value) || [],
        mean: col.mean,
        std: col.std,
        min: col.min,
        max: col.max,
        median: col.median,
        q1: col.q1,
        q3: col.q3,
        skewness: col.skewness,
        kurtosis: col.kurtosis,
        outlier_count: col.outlierCount,
        // NOTE(review): unlike the other rates this one is passed through without dividing
        // by 100 — confirm which unit the frontend expects.
        outlier_ratio: col.outlierRate,
        top_categories: col.topValues?.map((v: any) => ({
          value: v.value,
          count: v.count,
          ratio: (v.percentage || 0) / 100
        }))
      };
    }),
    warnings: quality?.issues || [],
    recommendations: quality?.recommendations || [],
    generated_at: new Date().toISOString()
  };
}
/**
 * Builds a simplified fallback profile from the session's data schema alone.
 *
 * Only counts available in the schema (`rowCount`, and per column `type`, `nullCount`,
 * `uniqueValues`) are used; the quality score and grade are fixed placeholders.
 *
 * @param schema   Session data schema; `columns` and `rowCount` may be absent.
 * @param fileName File name reported in the profile.
 * @returns The frontend profile object; `generated_at` is the creation time (ISO 8601).
 */
function generateFallbackProfile(schema: any, fileName: string) {
  const schemaColumns = schema.columns || [];
  const rows = schema.rowCount || 0;

  // Single pass over the schema columns: tally type counts and missing cells
  // while building the per-column entries.
  let numericCount = 0;
  let categoricalCount = 0;
  let missingTotal = 0;
  const columnProfiles: any[] = [];
  schemaColumns.forEach((column: any) => {
    const nulls = column.nullCount || 0;
    const distinct = column.uniqueValues || 0;
    if (column.type === 'numeric') {
      numericCount += 1;
    } else if (column.type === 'categorical') {
      categoricalCount += 1;
    }
    missingTotal = missingTotal + nulls;
    columnProfiles.push({
      name: column.name,
      dtype: column.type,
      inferred_type: column.type,
      non_null_count: rows - nulls,
      null_count: nulls,
      null_ratio: rows > 0 ? nulls / rows : 0,
      unique_count: distinct,
      unique_ratio: rows > 0 ? distinct / rows : 0,
      sample_values: []
    });
  });

  const cellCount = rows * schemaColumns.length;
  const warnings = missingTotal > 0 ? [`数据中存在 ${missingTotal} 个缺失值`] : [];

  return {
    file_name: fileName,
    row_count: rows,
    column_count: schemaColumns.length,
    total_cells: cellCount,
    missing_cells: missingTotal,
    missing_ratio: cellCount > 0 ? missingTotal / cellCount : 0,
    duplicate_rows: 0,
    duplicate_ratio: 0,
    numeric_columns: numericCount,
    categorical_columns: categoricalCount,
    datetime_columns: 0,
    quality_score: 80,
    quality_grade: 'B',
    columns: columnProfiles,
    warnings,
    recommendations: ['建议检查数据完整性后再进行分析'],
    generated_at: new Date().toISOString()
  };
}

View File

@@ -0,0 +1,369 @@
/**
* SSA Conclusion Generator Service (Phase 2A)
*
* 结论生成器:整合多步骤分析结果,生成论文级结论
*
* 功能:
* - 多步骤结果整合
* - 论文级结论模板
* - 方法学说明 + 局限性声明
*/
import { logger } from '../../../common/logging/index.js';
import { StepResult } from './WorkflowExecutorService.js';
// Structure of the conclusion report
/** Consolidated report produced from the results of all workflow steps. */
export interface ConclusionReport {
title: string;
summary: string;
sections: ConclusionSection[];
methodology: string;
limitations: string[];
references?: string[];
}
/** Conclusion drawn from a single workflow step. */
export interface ConclusionSection {
stepOrder: number;
toolName: string;
finding: string;
interpretation: string;
// 'na' is used for steps that have no p-value to interpret (e.g. descriptive statistics).
significance: 'significant' | 'not_significant' | 'marginal' | 'na';
details?: Record<string, any>;
}
/**
 * Turns the results of a multi-step workflow into a report: one section per
 * successful step, an overall summary, a methodology note and a list of limitations.
 */
export class ConclusionGeneratorService {
  /**
   * Generates the consolidated conclusion report.
   *
   * @param results Execution results of every workflow step.
   * @param goal    The analysis goal; used in the title and the summary.
   * @returns The conclusion report.
   */
  generateConclusion(results: StepResult[], goal: string): ConclusionReport {
    logger.info('[SSA:Conclusion] Generating conclusion', {
      stepCount: results.length,
      goal
    });

    // Only steps that finished with 'success' or 'warning' contribute a section.
    const sections: ConclusionSection[] = [];
    for (const result of results) {
      if (result.status !== 'success' && result.status !== 'warning') {
        continue;
      }
      const section = this.generateSectionConclusion(result);
      if (section) {
        sections.push(section);
      }
    }

    const report: ConclusionReport = {
      title: `统计分析报告:${goal}`,
      summary: this.generateSummary(sections, goal),
      sections,
      methodology: this.generateMethodology(results),
      limitations: this.generateLimitations(results)
    };

    logger.info('[SSA:Conclusion] Conclusion generated', {
      sectionCount: sections.length,
      hasLimitations: report.limitations.length > 0
    });
    return report;
  }

  /**
   * Builds the section for one step, or returns null when the step carries no result payload.
   */
  private generateSectionConclusion(result: StepResult): ConclusionSection | null {
    if (!result.result) {
      return null;
    }
    const { toolCode, toolName, stepOrder } = result;
    const data = result.result;

    let finding = '';
    let interpretation = '';
    let significance: ConclusionSection['significance'] = 'na';

    switch (toolCode) {
      case 'ST_DESCRIPTIVE':
        finding = this.formatDescriptiveFindings(data);
        interpretation = '上述数据展示了研究样本的基本特征分布。';
        break;
      case 'ST_T_TEST_IND':
      case 'ST_MANN_WHITNEY':
        significance = this.interpretPValue(data.p_value);
        finding = this.formatComparisonFindings(data, toolCode);
        interpretation = this.interpretComparison(data, significance);
        break;
      case 'ST_T_TEST_PAIRED':
        significance = this.interpretPValue(data.p_value);
        finding = this.formatPairedFindings(data);
        interpretation = this.interpretPairedResult(data, significance);
        break;
      case 'ST_CHI_SQUARE':
        significance = this.interpretPValue(data.p_value);
        finding = this.formatChiSquareFindings(data);
        interpretation = this.interpretChiSquare(data, significance);
        break;
      case 'ST_CORRELATION':
        significance = this.interpretPValue(data.p_value);
        finding = this.formatCorrelationFindings(data);
        interpretation = this.interpretCorrelation(data, significance);
        break;
      case 'ST_LOGISTIC_BINARY':
        // A regression has one p-value per coefficient, so no single significance is assigned.
        finding = this.formatLogisticFindings(data);
        interpretation = this.interpretLogistic(data);
        significance = 'na';
        break;
      default:
        finding = `${toolName} 分析已完成。`;
        interpretation = '请参考详细结果解读。';
    }

    return {
      stepOrder,
      toolName,
      finding,
      interpretation,
      significance,
      details: {
        pValue: data.p_value,
        pValueFmt: data.p_value_fmt
      }
    };
  }

  /**
   * Builds the overall summary paragraph from the per-step sections.
   */
  private generateSummary(sections: ConclusionSection[], goal: string): string {
    const significantFindings = sections.filter(s => s.significance === 'significant');
    const marginalFindings = sections.filter(s => s.significance === 'marginal');
    // Sections that actually carried an interpretable p-value.
    const testedSections = sections.filter(s => s.significance !== 'na');

    let summary = `针对「${goal}」进行了 ${sections.length} 项统计分析。`;
    if (significantFindings.length > 0) {
      summary += `\n\n主要发现:${significantFindings.length} 项分析达到统计学显著性(p < 0.05)。`;
      for (const finding of significantFindings) {
        summary += `\n- ${finding.toolName}:${finding.interpretation}`;
      }
    }
    if (marginalFindings.length > 0) {
      summary += `\n\n边缘性发现:${marginalFindings.length} 项分析接近显著水平(0.05 ≤ p < 0.10)。`;
    }
    // Only state "nothing significant" when at least one hypothesis test was interpreted;
    // a purely descriptive run makes no such claim.
    if (significantFindings.length === 0 && testedSections.length > 0) {
      summary += '\n\n本次分析未发现具有统计学显著性的差异或关联。';
    }
    return summary;
  }

  /**
   * Builds the methodology note, listing each distinct `result.method` once.
   */
  private generateMethodology(results: StepResult[]): string {
    const methods: string[] = [];
    for (const result of results) {
      if (result.result?.method) {
        methods.push(result.result.method);
      }
    }

    let methodology = '本研究采用以下统计方法进行分析:\n';
    for (const method of new Set(methods)) {
      methodology += `- ${method}\n`;
    }
    methodology += '\n所有分析均在执行前进行了统计假设检验(正态性、方差齐性等),并根据检验结果自动选择适当的统计方法。';
    methodology += '\n显著性水平设定为 α = 0.05(双侧检验)。';
    return methodology;
  }

  /**
   * Builds the list of limitations: a small-sample note, a warnings note and two generic ones.
   */
  private generateLimitations(results: StepResult[]): string[] {
    const limitations: string[] = [];

    // Small-sample check: flag once if any group of any step has n < 30.
    for (const result of results) {
      const groupStats = result.result?.group_stats;
      // Skip payloads whose group_stats is absent or not a list instead of throwing.
      if (!Array.isArray(groupStats)) {
        continue;
      }
      const minN = Math.min(...groupStats.map((g: any) => g.n || 0));
      if (minN < 30) {
        limitations.push('部分分析的样本量较小(n < 30),可能影响结果的稳健性。');
        break;
      }
    }

    // Statistical warnings reported by any step.
    const warnings = results.flatMap(r => r.result?.warnings || []);
    if (warnings.length > 0) {
      limitations.push('分析过程中存在统计警告,请谨慎解读结果。');
    }

    // Generic limitations that are always included.
    limitations.push('本分析基于横断面数据,无法推断因果关系。');
    limitations.push('未考虑潜在的混杂因素,结果可能存在偏倚。');
    return limitations;
  }

  // ==================== Formatting helpers ====================

  /** Formats a number with a fixed number of decimals; 'N/A' when the value is not a finite number. */
  private formatNumber(value: unknown, digits: number): string {
    return typeof value === 'number' && Number.isFinite(value) ? value.toFixed(digits) : 'N/A';
  }

  /**
   * Returns the fragment printed after the literal "p " in a finding.
   * Presumably `p_value_fmt` already contains its comparison operator (e.g. "< 0.001") —
   * TODO confirm against the R service; the numeric fallback therefore adds "= " itself.
   */
  private formatPFragment(data: any): string {
    if (data.p_value_fmt) {
      return data.p_value_fmt;
    }
    return typeof data.p_value === 'number' && Number.isFinite(data.p_value)
      ? `= ${data.p_value.toFixed(4)}`
      : '= N/A';
  }

  private formatDescriptiveFindings(data: any): string {
    const summary = data.summary;
    if (!summary) return '描述性统计已完成。';
    return `样本包含 ${summary.n_total || '?'} 个观测值,` +
      `${summary.n_numeric || 0} 个数值变量,` +
      `${summary.n_categorical || 0} 个分类变量。`;
  }

  private formatComparisonFindings(data: any, toolCode: string): string {
    const stats = Array.isArray(data.group_stats) ? data.group_stats : [];
    const pFmt = this.formatPFragment(data);
    if (stats.length >= 2) {
      const [g1, g2] = stats;
      if (toolCode === 'ST_T_TEST_IND') {
        return `${g1.group} 组均值为 ${this.formatNumber(g1.mean, 2)} ± ${this.formatNumber(g1.sd, 2)} (n=${g1.n}),` +
          `${g2.group} 组均值为 ${this.formatNumber(g2.mean, 2)} ± ${this.formatNumber(g2.sd, 2)} (n=${g2.n});` +
          `t = ${this.formatNumber(data.statistic, 2)},p ${pFmt}`;
      }
      return `${g1.group} 组中位数为 ${this.formatNumber(g1.median, 2)} (n=${g1.n}),` +
        `${g2.group} 组中位数为 ${this.formatNumber(g2.median, 2)} (n=${g2.n});` +
        `U = ${this.formatNumber(data.statistic_U, 0)},p ${pFmt}`;
    }
    return `两组比较:p ${pFmt}`;
  }

  private formatPairedFindings(data: any): string {
    const desc = data.descriptive;
    const pFmt = this.formatPFragment(data);
    if (desc) {
      return `前测均值 ${this.formatNumber(desc.before?.mean, 2)} ± ${this.formatNumber(desc.before?.sd, 2)},` +
        `后测均值 ${this.formatNumber(desc.after?.mean, 2)} ± ${this.formatNumber(desc.after?.sd, 2)},` +
        `差值 ${this.formatNumber(desc.difference?.mean, 2)} ± ${this.formatNumber(desc.difference?.sd, 2)};` +
        `t = ${this.formatNumber(data.statistic, 2)},p ${pFmt}`;
    }
    return `配对比较:p ${pFmt}`;
  }

  private formatChiSquareFindings(data: any): string {
    const chi = this.formatNumber(data.statistic, 2);
    return `χ² = ${chi},df = ${data.df ?? 'N/A'},p ${this.formatPFragment(data)}`;
  }

  private formatCorrelationFindings(data: any): string {
    const r = this.formatNumber(data.statistic, 3);
    // Any method code other than 'pearson' is labelled Spearman, as in the original.
    const method = data.method_code === 'pearson' ? 'Pearson' : 'Spearman';
    return `${method} 相关系数 r = ${r},p ${this.formatPFragment(data)};` +
      `相关强度:${data.interpretation || '待解读'}`;
  }

  private formatLogisticFindings(data: any): string {
    const coeffs = data.coefficients || [];
    const sigCoeffs = coeffs.filter((c: any) => c.significant && c.variable !== '(Intercept)');
    if (sigCoeffs.length === 0) {
      return 'Logistic 回归分析中未发现统计学显著的预测因子。';
    }
    // At most the first three significant predictors are listed.
    const findings = sigCoeffs.slice(0, 3).map((c: any) =>
      `${c.variable} (OR=${c.OR}, 95%CI [${c.ci_lower}, ${c.ci_upper}], p ${c.p_value_fmt})`
    );
    return `多因素分析显示以下因素具有统计学显著性:${findings.join(';')}`;
  }

  // ==================== Interpretation helpers ====================

  /**
   * Classifies a p-value: < 0.05 significant, < 0.10 marginal, otherwise not significant.
   * A missing or non-numeric p-value is 'na'; without this guard `null < 0.05` evaluates
   * to true and a missing p-value would be reported as significant.
   */
  private interpretPValue(p: unknown): ConclusionSection['significance'] {
    // Numeric strings are still accepted, as the original comparison coerced them.
    const value = typeof p === 'string' && p.trim() !== '' ? Number(p) : p;
    if (typeof value !== 'number' || !Number.isFinite(value)) {
      return 'na';
    }
    if (value < 0.05) return 'significant';
    if (value < 0.10) return 'marginal';
    return 'not_significant';
  }

  private interpretComparison(data: any, sig: ConclusionSection['significance']): string {
    if (sig === 'significant') {
      return '两组之间存在统计学显著差异。';
    } else if (sig === 'marginal') {
      return '两组之间存在边缘显著性差异,建议增加样本量进一步验证。';
    }
    return '两组之间无统计学显著差异。';
  }

  private interpretPairedResult(data: any, sig: ConclusionSection['significance']): string {
    if (sig === 'significant') {
      // NOTE(review): assumes difference.mean is (after − before) — confirm against the R tool.
      const diff = data.descriptive?.difference?.mean || 0;
      const direction = diff > 0 ? '显著升高' : '显著降低';
      return `配对比较结果表明,后测值较前测值${direction}。`;
    }
    return '配对比较未发现统计学显著变化。';
  }

  private interpretChiSquare(data: any, sig: ConclusionSection['significance']): string {
    if (sig === 'significant') {
      const v = data.effect_size?.cramers_v;
      // The effect size is appended only when it is a non-zero number, as before.
      const strength = typeof v === 'number' && v ? `(效应量 Cramér's V = ${v.toFixed(3)})` : '';
      return `两个分类变量之间存在统计学显著关联${strength}。`;
    }
    return '两个分类变量之间无统计学显著关联。';
  }

  private interpretCorrelation(data: any, sig: ConclusionSection['significance']): string {
    const r = data.statistic || 0;
    const direction = r > 0 ? '正相关' : '负相关';
    if (sig === 'significant') {
      return `两变量之间存在统计学显著的${direction}。`;
    }
    return '两变量之间不存在统计学显著的线性相关。';
  }

  private interpretLogistic(data: any): string {
    const coeffs = data.coefficients || [];
    const predictors = coeffs.filter((c: any) => c.variable !== '(Intercept)');
    const sigCount = predictors.filter((c: any) => c.significant).length;
    return `在纳入的 ${predictors.length} 个自变量中,${sigCount} 个对结局变量具有独立的统计学显著效应。`;
  }
}
// Singleton export
export const conclusionGeneratorService = new ConclusionGeneratorService();

View File

@@ -0,0 +1,353 @@
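/**
 * SSA DataProfile service (Phase 2A)
 *
 * Calls Python Tool C to generate a data profile, which the LLM uses to draft the analysis plan.
 *
 * Timing: when the user uploads data (trigger point A).
 * Output: a DataProfile JSON document, stored in SsaSession.dataProfile.
 */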
import axios, { AxiosInstance } from 'axios';
import { logger } from '../../../common/logging/index.js';
import { prisma } from '../../../config/database.js';
import { storage } from '../../../common/storage/index.js';
/** Data profile of one dataset: per-column statistics plus a dataset-level summary. */
export interface DataProfile {
columns: ColumnProfile[];
summary: DataSummary;
}
/**
 * Statistics of a single column. Which optional fields are present depends on `type`.
 */
export interface ColumnProfile {
name: string;
type: 'numeric' | 'categorical' | 'datetime' | 'text';
missingCount: number;
// Presumably a percentage (0-100): the route-level converter divides it by 100 — verify against the Python service.
missingRate: number;
uniqueCount: number;
totalCount: number;
// Numeric columns
mean?: number;
std?: number;
median?: number;
min?: number;
max?: number;
q1?: number;
q3?: number;
iqr?: number;
outlierCount?: number;
outlierRate?: number;
skewness?: number;
kurtosis?: number;
// Categorical columns
topValues?: Array<{ value: string; count: number; percentage: number }>;
totalLevels?: number;
modeValue?: string;
modeCount?: number;
// Datetime columns
minDate?: string;
maxDate?: string;
dateRange?: string;
}
/** Dataset-level counts and missing-data totals. */
export interface DataSummary {
totalRows: number;
totalColumns: number;
numericColumns: number;
categoricalColumns: number;
datetimeColumns: number;
textColumns: number;
// Presumably a percentage (0-100), like ColumnProfile.missingRate — verify against the Python service.
overallMissingRate: number;
totalMissingCells: number;
}
/** Quality assessment returned together with the profile. */
export interface QualityScore {
score: number;
grade: 'A' | 'B' | 'C' | 'D';
gradeDescription: string;
issues: string[];
recommendations: string[];
}
/**
 * Outcome of a profile-generation call. On failure `success` is false and `error` is set;
 * the service methods report failures through this object rather than by throwing.
 */
export interface DataProfileResult {
success: boolean;
profile?: DataProfile;
quality?: QualityScore;
executionTime?: number;
error?: string;
}
export class DataProfileService {
/** HTTP client bound to the profiling service. */
private client: AxiosInstance;
/**
 * Creates the HTTP client. The base URL is read from EXTRACTION_SERVICE_URL and
 * defaults to http://localhost:8000; requests time out after 60 seconds.
 */
constructor() {
  this.client = axios.create({
    baseURL: process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000',
    timeout: 60000,
    headers: { 'Content-Type': 'application/json' }
  });
}
/**
 * Generates the data profile of an SSA session from in-memory JSON rows.
 *
 * Posts the rows to the profiling service, stores the returned profile on the
 * session and returns it. Failures are logged and reported through the result
 * object (`success: false`); this method does not throw.
 *
 * @param sessionId SSA session ID.
 * @param data      Data rows as an array of JSON objects.
 * @returns The profile result.
 */
async generateProfile(sessionId: string, data: Record<string, any>[]): Promise<DataProfileResult> {
  const begunAt = Date.now();
  try {
    // The column count is taken from the keys of the first row.
    const columnCount = data.length > 0 ? Object.keys(data[0]).length : 0;
    logger.info('[SSA:DataProfile] Generating profile', {
      sessionId,
      rowCount: data.length,
      columnCount
    });

    const requestBody = {
      data,
      max_unique_values: 20,
      include_quality_score: true
    };
    const { data: payload } = await this.client.post('/api/ssa/data-profile', requestBody);
    if (!payload.success) {
      throw new Error(payload.error || 'Profile generation failed');
    }

    const result: DataProfileResult = {
      success: true,
      profile: payload.profile,
      quality: payload.quality,
      executionTime: payload.execution_time
    };

    // Persist to the database.
    await this.saveProfileToSession(sessionId, result);

    logger.info('[SSA:DataProfile] Profile generated successfully', {
      sessionId,
      executionMs: Date.now() - begunAt,
      summary: result.profile?.summary
    });
    return result;
  } catch (error: any) {
    const executionMs = Date.now() - begunAt;
    logger.error('[SSA:DataProfile] Profile generation failed', {
      sessionId,
      error: error.message,
      executionMs
    });
    // On failure the locally measured time is reported, divided by 1000.
    return {
      success: false,
      error: error.message,
      executionTime: executionMs / 1000
    };
  }
}
/**
 * Generates the data profile directly from CSV text.
 *
 * The raw CSV is sent to the profiling service, which does the parsing. The
 * returned profile is stored on the session. Failures are logged and reported
 * through the result object (`success: false`); this method does not throw.
 *
 * @param sessionId  SSA session ID.
 * @param csvContent Content of the CSV file.
 * @returns The profile result.
 */
async generateProfileFromCSV(sessionId: string, csvContent: string): Promise<DataProfileResult> {
  const begunAt = Date.now();
  try {
    logger.info('[SSA:DataProfile] Generating profile from CSV', {
      sessionId,
      contentLength: csvContent.length
    });

    // The CSV text is forwarded unparsed; parsing happens in the Python service.
    const requestBody = {
      csv_content: csvContent,
      max_unique_values: 20,
      include_quality_score: true
    };
    const { data: payload } = await this.client.post('/api/ssa/data-profile-csv', requestBody);
    if (!payload.success) {
      throw new Error(payload.error || 'Profile generation failed');
    }

    const result: DataProfileResult = {
      success: true,
      profile: payload.profile,
      quality: payload.quality,
      executionTime: payload.execution_time
    };

    // Persist to the database.
    await this.saveProfileToSession(sessionId, result);

    logger.info('[SSA:DataProfile] Profile generated from CSV successfully', {
      sessionId,
      executionMs: Date.now() - begunAt,
      summary: result.profile?.summary
    });
    return result;
  } catch (error: any) {
    const executionMs = Date.now() - begunAt;
    logger.error('[SSA:DataProfile] CSV profile generation failed', {
      sessionId,
      error: error.message,
      executionMs
    });
    // On failure the locally measured time is reported, divided by 1000.
    return {
      success: false,
      error: error.message,
      executionTime: executionMs / 1000
    };
  }
}
/**
 * Loads the session's data and generates its profile.
 *
 * Resolution order:
 * 1. a profile already stored on the session is returned as is (without a quality score);
 * 2. inline `dataPayload` rows are profiled directly;
 * 3. the file behind `dataOssKey` is downloaded and profiled as JSON when it parses
 *    as JSON, otherwise as CSV.
 *
 * Failures are logged and reported through the result object (`success: false`);
 * this method does not throw.
 *
 * @param sessionId SSA session ID.
 * @returns The profile result.
 */
async generateProfileFromSession(sessionId: string): Promise<DataProfileResult> {
  try {
    const session = await prisma.ssaSession.findUnique({
      where: { id: sessionId }
    });
    if (!session) {
      throw new Error(`Session not found: ${sessionId}`);
    }

    // A stored profile is returned directly.
    if (session.dataProfile) {
      logger.info('[SSA:DataProfile] Using cached profile', { sessionId });
      return {
        success: true,
        profile: session.dataProfile as unknown as DataProfile
      };
    }

    if (session.dataPayload) {
      // Inline JSON rows.
      const data = session.dataPayload as unknown as Record<string, any>[];
      return await this.generateProfile(sessionId, data);
    }

    if (session.dataOssKey) {
      // Download the file from object storage.
      const buffer = await storage.download(session.dataOssKey);
      // Drop a leading byte-order mark: it is not data, JSON.parse rejects it, and it
      // would otherwise become part of the first CSV header.
      const content = buffer.toString('utf-8').replace(/^\uFEFF/, '');
      const trimmedContent = content.trim();

      // Format detection: text that looks like JSON is parsed as JSON.
      if (trimmedContent.startsWith('[') || trimmedContent.startsWith('{')) {
        let parsed: unknown;
        let isJson = true;
        try {
          // Parse the trimmed text, the same text the detection above looked at.
          parsed = JSON.parse(trimmedContent);
        } catch {
          isJson = false;
        }
        if (isJson) {
          return await this.generateProfile(sessionId, parsed as Record<string, any>[]);
        }
        // Starts like JSON but is not: e.g. a CSV whose first header begins with '['.
        logger.info('[SSA:DataProfile] Content is not valid JSON, treating it as CSV', { sessionId });
      }

      // CSV: sent unparsed so the Python service does the parsing.
      return await this.generateProfileFromCSV(sessionId, content);
    }

    throw new Error('No data available for session');
  } catch (error: any) {
    logger.error('[SSA:DataProfile] Failed to generate profile from session', {
      sessionId,
      error: error.message
    });
    return {
      success: false,
      error: error.message
    };
  }
}
/**
 * Stores the generated profile on the session record.
 *
 * Best-effort: a failed write is logged and swallowed so that profile
 * generation itself still succeeds for the caller.
 */
private async saveProfileToSession(sessionId: string, result: DataProfileResult): Promise<void> {
  const where = { id: sessionId };
  try {
    const data = { dataProfile: result.profile as any };
    await prisma.ssaSession.update({ where, data });
    logger.info('[SSA:DataProfile] Profile saved to session', { sessionId });
  } catch (error: any) {
    const details = { sessionId, error: error.message };
    logger.error('[SSA:DataProfile] Failed to save profile', details);
  }
}
/**
 * Returns the profile already stored on the session, if any.
 *
 * @param sessionId SSA session ID
 * @returns The stored profile, or `null` when the session does not exist or
 *          has no profile yet (previously `undefined` leaked out for a
 *          missing session despite the declared return type).
 */
async getCachedProfile(sessionId: string): Promise<DataProfile | null> {
  const session = await prisma.ssaSession.findUnique({
    where: { id: sessionId },
    select: { dataProfile: true }
  });
  const stored = session?.dataProfile;
  // Normalise both "no session" (undefined) and "no profile" (null) to null.
  return stored == null ? null : (stored as unknown as DataProfile);
}
/**
 * Renders a compact Markdown summary of a data profile for prompt injection,
 * keeping token usage low.
 *
 * Output: an overview section (row/column counts, overall missing rate)
 * followed by one bullet per column. Numeric columns list mean/SD/range and
 * outlier count; categorical columns list the level count and up to five
 * top values.
 *
 * @param profile Data profile to summarise
 * @returns Multi-line Markdown text
 */
generateProfileSummaryForLLM(profile: DataProfile): string {
  const { summary, columns } = profile;
  const lines: string[] = [
    `## 数据概况`,
    `- 样本量: ${summary.totalRows}`,
    `- 变量数: ${summary.totalColumns} 列 (${summary.numericColumns} 数值, ${summary.categoricalColumns} 分类)`,
    `- 整体缺失率: ${summary.overallMissingRate}%`,
    '',
    `## 变量清单`
  ];

  for (const col of columns) {
    let desc = `- **${col.name}** [${col.type}]`;
    if (col.missingRate > 0) {
      desc += ` (缺失 ${col.missingRate}%)`;
    }

    if (col.type === 'numeric') {
      desc += `: 均值=${col.mean}, SD=${col.std}, 范围=[${col.min}, ${col.max}]`;
      if (col.outlierCount && col.outlierCount > 0) {
        desc += `, ${col.outlierCount}个异常值`;
      }
    } else if (col.type === 'categorical') {
      // Both fields may be absent; build each part only when data exists so
      // the literal text "undefined" never reaches the prompt.
      const shownLevels = (col.topValues ?? []).slice(0, 5).map(v => v.value).join(', ');
      const ellipsis = col.totalLevels && col.totalLevels > 5 ? '...' : '';
      const countText = col.totalLevels == null ? '' : `${col.totalLevels}个水平`;
      const levelText = shownLevels ? `(${shownLevels}${ellipsis})` : '';
      const detail = [countText, levelText].filter(part => part !== '').join(' ');
      if (detail) {
        desc += `: ${detail}`;
      }
    }
    lines.push(desc);
  }

  return lines.join('\n');
}
}
// Singleton export: one shared DataProfileService instance for the module's consumers.
export const dataProfileService = new DataProfileService();

View File

@@ -0,0 +1,521 @@
/**
* SSA Workflow Executor Service (Phase 2A)
*
* 流程执行器:串联执行多个统计工具
*
* 功能:
* - 按顺序执行工作流步骤
* - JIT 护栏检查(执行前)
* - 结果在步骤间传递
* - 容错管道:支持部分成功
* - SSE 实时进度推送
*/
import { EventEmitter } from 'events';
import axios, { AxiosInstance } from 'axios';
import { logger } from '../../../common/logging/index.js';
import { prisma } from '../../../config/database.js';
import { storage } from '../../../common/storage/index.js';
import { WorkflowStep, ToolCode, AVAILABLE_TOOLS } from './WorkflowPlannerService.js';
import { conclusionGeneratorService, ConclusionReport } from './ConclusionGeneratorService.js';
/**
 * Outcome of executing a single workflow step.
 */
export interface StepResult {
// Position of the step within its workflow (the first step is 1).
stepOrder: number;
// Code of the statistical tool that was run; also used in the R service URL.
toolCode: string;
// Display name of the tool.
toolName: string;
// 'warning' means the tool ran but reported warnings; 'error' means it failed or was blocked.
status: 'success' | 'warning' | 'error' | 'skipped';
// Tool output assembled from the R service response; absent on error.
result?: any;
// JIT guardrail check results, when the tool requires them.
guardrailChecks?: GuardrailCheck[];
// Populated only when status is 'error'.
error?: {
code: string;
message: string;
userHint: string;
};
// Wall-clock duration of the step in milliseconds.
executionMs: number;
}
/**
 * Result of one guardrail check, as returned by the R service's JIT
 * guardrail endpoint.
 */
export interface GuardrailCheck {
checkName: string;
passed: boolean;
// NOTE(review): presumably the p-value of the underlying assumption test — confirm against the R service.
pValue?: number;
recommendation: string;
}
/**
 * Progress message emitted on the executor's 'progress' event
 * (SSE message format).
 */
export interface SSEMessage {
type: 'step_start' | 'step_progress' | 'step_complete' | 'step_error' | 'workflow_complete';
// Step order the message refers to; for 'workflow_complete' the executor sends the step count.
step: number;
total_steps?: number;
// Empty string on 'workflow_complete' messages.
toolCode: string;
// Empty string on 'workflow_complete' messages.
toolName: string;
status: 'running' | 'success' | 'error' | 'skipped' | 'warning';
// Human-readable status text.
message: string;
// Coarse progress indicator; the executor emits 30 and 60 (presumably a percentage — confirm with the frontend).
progress?: number;
durationMs?: number;
result?: any;
error?: {
code: string;
message: string;
userHint: string;
};
// ISO-8601 timestamp produced with Date#toISOString.
timestamp: string;
}
/**
 * Aggregate result of running a whole workflow.
 */
export interface WorkflowExecutionResult {
workflowId: string;
// 'completed' = every step succeeded, 'partial' = some succeeded, 'error' = none succeeded or the run aborted.
status: 'completed' | 'partial' | 'error';
totalSteps: number;
// Number of steps that were actually executed (successful or not).
completedSteps: number;
// Number of executed steps that ended in 'success' or 'warning'.
successSteps: number;
results: StepResult[];
// Generated only when at least one step succeeded.
conclusion?: ConclusionReport;
// Wall-clock duration of the whole run in milliseconds.
executionMs: number;
}
export class WorkflowExecutorService extends EventEmitter {
private rClient: AxiosInstance;
constructor() {
  super();
  // HTTP client for the R statistics service. The base URL comes from
  // R_SERVICE_URL, with a local default; requests time out after 120 s.
  this.rClient = axios.create({
    baseURL: process.env.R_SERVICE_URL || 'http://localhost:8082',
    timeout: 120000,
    headers: { 'Content-Type': 'application/json' }
  });
}
/**
 * Runs every step of a stored workflow in order.
 *
 * For each step it executes the tool, persists the step outcome, bumps the
 * workflow's completed-step counter and emits a 'progress' event. A failure
 * of a critical step (see `isCriticalStep`) stops the run early; other
 * failures are tolerated so the workflow can finish partially.
 *
 * Never rejects: unexpected failures are logged, the workflow is marked
 * 'error' on a best-effort basis, and an 'error' result is returned.
 *
 * @param workflowId Workflow ID
 * @param sessionId Session ID whose data the steps run against
 * @returns Aggregate execution result
 */
async executeWorkflow(workflowId: string, sessionId: string): Promise<WorkflowExecutionResult> {
  const startTime = Date.now();
  const results: StepResult[] = [];
  // Hoisted so the failure path can report how far the run got.
  let totalSteps = 0;
  let successCount = 0;

  logger.info('[SSA:Executor] Starting workflow execution', { workflowId, sessionId });

  try {
    // Load the workflow together with its steps in execution order.
    const workflow = await prisma.ssaWorkflow.findUnique({
      where: { id: workflowId },
      include: { steps: { orderBy: { stepOrder: 'asc' } } }
    });
    if (!workflow) {
      throw new Error(`Workflow not found: ${workflowId}`);
    }
    totalSteps = workflow.steps.length;

    const session = await prisma.ssaSession.findUnique({
      where: { id: sessionId }
    });
    if (!session) {
      throw new Error(`Session not found: ${sessionId}`);
    }

    await prisma.ssaWorkflow.update({
      where: { id: workflowId },
      data: {
        status: 'running',
        startedAt: new Date()
      }
    });

    const dataSource = await this.prepareDataSource(session);

    // Output of the most recent successful step, handed to the next one.
    let previousResults: any = null;

    for (const step of workflow.steps) {
      const stepResult = await this.executeStep(
        step,
        session,
        dataSource,
        previousResults
      );
      results.push(stepResult);

      // Persist the step outcome.
      await prisma.ssaWorkflowStep.update({
        where: { id: step.id },
        data: {
          status: stepResult.status,
          outputResult: stepResult.result,
          guardrailChecks: stepResult.guardrailChecks as any,
          errorInfo: stepResult.error as any,
          executionMs: stepResult.executionMs,
          completedAt: new Date()
        }
      });

      // Bump the workflow progress counter.
      await prisma.ssaWorkflow.update({
        where: { id: workflowId },
        data: { completedSteps: { increment: 1 } }
      });

      if (stepResult.status === 'success' || stepResult.status === 'warning') {
        successCount++;
        previousResults = stepResult.result;
      }

      this.emitProgress({
        type: stepResult.status === 'error' ? 'step_error' : 'step_complete',
        step: step.stepOrder,
        total_steps: workflow.steps.length,
        toolCode: step.toolCode,
        toolName: step.toolName,
        status: stepResult.status,
        message: stepResult.status === 'error'
          ? `${step.toolName} 执行失败: ${stepResult.error?.message}`
          : `${step.toolName} 执行完成`,
        result: stepResult.result,
        durationMs: stepResult.executionMs,
        error: stepResult.error,
        timestamp: new Date().toISOString()
      });

      // A failed critical step makes the remaining steps pointless.
      if (stepResult.status === 'error' && this.isCriticalStep(step.stepOrder, workflow.steps.length)) {
        logger.warn('[SSA:Executor] Critical step failed, stopping workflow', {
          workflowId,
          step: step.stepOrder
        });
        break;
      }
    }

    // Derive the final status from how many steps succeeded.
    const executionMs = Date.now() - startTime;
    let finalStatus: 'completed' | 'partial' | 'error' = 'completed';
    if (successCount === 0) {
      finalStatus = 'error';
    } else if (successCount < workflow.steps.length) {
      finalStatus = 'partial';
    }

    await prisma.ssaWorkflow.update({
      where: { id: workflowId },
      data: {
        status: finalStatus,
        completedAt: new Date()
      }
    });

    // A run with zero successful steps gets its own wording instead of
    // being reported as "partially completed".
    let completionMessage: string;
    if (finalStatus === 'completed') {
      completionMessage = `分析流程执行完成,共 ${successCount} 个步骤`;
    } else if (finalStatus === 'partial') {
      completionMessage = `分析流程部分完成,${successCount}/${workflow.steps.length} 个步骤成功`;
    } else {
      completionMessage = `分析流程执行失败,${workflow.steps.length} 个步骤均未成功`;
    }

    this.emitProgress({
      type: 'workflow_complete',
      step: workflow.steps.length,
      toolCode: '',
      toolName: '',
      status: finalStatus === 'completed' ? 'success' : finalStatus === 'partial' ? 'warning' : 'error',
      message: completionMessage,
      timestamp: new Date().toISOString()
    });

    // Produce an overall conclusion only when there is something to conclude from.
    let conclusion: ConclusionReport | undefined;
    if (successCount > 0) {
      const workflowPlan = workflow.workflowPlan as any;
      conclusion = conclusionGeneratorService.generateConclusion(
        results,
        workflowPlan?.goal || '统计分析'
      );
    }

    logger.info('[SSA:Executor] Workflow execution finished', {
      workflowId,
      status: finalStatus,
      successCount,
      totalSteps: workflow.steps.length,
      executionMs,
      hasConclusion: !!conclusion
    });

    return {
      workflowId,
      status: finalStatus,
      totalSteps: workflow.steps.length,
      completedSteps: results.length,
      successSteps: successCount,
      results,
      conclusion,
      executionMs
    };
  } catch (error: any) {
    logger.error('[SSA:Executor] Workflow execution failed', {
      workflowId,
      error: error.message
    });

    // Marking the workflow as failed is best-effort: the update itself can
    // throw (for instance when the workflow row does not exist), and that
    // must not turn the documented error result into a rejected promise.
    try {
      await prisma.ssaWorkflow.update({
        where: { id: workflowId },
        data: { status: 'error' }
      });
    } catch (updateError: any) {
      logger.error('[SSA:Executor] Failed to mark workflow as error', {
        workflowId,
        error: updateError.message
      });
    }

    return {
      workflowId,
      status: 'error',
      totalSteps,
      completedSteps: results.length,
      successSteps: successCount,
      results,
      executionMs: Date.now() - startTime
    };
  }
}
/**
 * Executes a single workflow step against the R service.
 *
 * Emits 'progress' events (start, guardrail phase, execution phase), marks
 * the step 'running' in the database, optionally runs JIT guardrail checks,
 * then posts the step to `/api/v1/skills/{toolCode}`.
 *
 * Failures of the guardrail/R-service phase are returned as an 'error'
 * StepResult instead of being thrown.
 *
 * @param step Workflow step record (reads id, stepOrder, toolCode, toolName, inputParams)
 * @param session Session record (its title is sent as the original filename)
 * @param dataSource Data source descriptor built by `prepareDataSource`
 * @param previousResults Output of the previous successful step (currently not used here)
 * @returns Outcome of the step
 */
private async executeStep(
  step: any,
  session: any,
  dataSource: any,
  previousResults: any
): Promise<StepResult> {
  const startTime = Date.now();
  // Declared outside the try block so guardrail findings are still reported
  // when the subsequent R call throws.
  let guardrailChecks: GuardrailCheck[] | undefined;

  this.emitProgress({
    type: 'step_start',
    step: step.stepOrder,
    toolCode: step.toolCode,
    toolName: step.toolName,
    status: 'running',
    message: `正在执行 ${step.toolName}...`,
    timestamp: new Date().toISOString()
  });

  await prisma.ssaWorkflowStep.update({
    where: { id: step.id },
    data: {
      status: 'running',
      startedAt: new Date()
    }
  });

  try {
    // JIT guardrails: assumption checks run right before the tool itself.
    if (this.needsGuardrailCheck(step.toolCode)) {
      this.emitProgress({
        type: 'step_progress',
        step: step.stepOrder,
        toolCode: step.toolCode,
        toolName: step.toolName,
        status: 'running',
        message: '正在执行统计假设检验JIT护栏...',
        progress: 30,
        timestamp: new Date().toISOString()
      });
      guardrailChecks = await this.runJITGuardrails(dataSource, step.toolCode, step.inputParams);
    }

    this.emitProgress({
      type: 'step_progress',
      step: step.stepOrder,
      toolCode: step.toolCode,
      toolName: step.toolName,
      status: 'running',
      message: `正在执行 ${step.toolName}...`,
      progress: 60,
      timestamp: new Date().toISOString()
    });

    // Run the tool on the R service.
    const response = await this.rClient.post(`/api/v1/skills/${step.toolCode}`, {
      data_source: dataSource,
      params: step.inputParams,
      original_filename: session.title || 'data.csv',
      guardrails: {
        check_normality: true,
        auto_fix: true
      }
    });
    const executionMs = Date.now() - startTime;

    if (response.data.status === 'error' || response.data.status === 'blocked') {
      return {
        stepOrder: step.stepOrder,
        toolCode: step.toolCode,
        toolName: step.toolName,
        status: 'error',
        guardrailChecks,
        error: {
          code: response.data.error_code || 'E100',
          message: response.data.message || '执行失败',
          userHint: response.data.user_hint || '请检查数据和参数'
        },
        executionMs
      };
    }

    return {
      stepOrder: step.stepOrder,
      toolCode: step.toolCode,
      toolName: step.toolName,
      status: response.data.warnings?.length > 0 ? 'warning' : 'success',
      result: {
        ...response.data.results,
        plots: response.data.plots,
        result_table: response.data.result_table,
        reproducible_code: response.data.reproducible_code,
        trace_log: response.data.trace_log,
        warnings: response.data.warnings,
      },
      guardrailChecks,
      executionMs
    };
  } catch (error: any) {
    const executionMs = Date.now() - startTime;
    logger.error('[SSA:Executor] Step execution failed', {
      step: step.stepOrder,
      toolCode: step.toolCode,
      error: error.message
    });

    // axios rejects on non-2xx responses and exposes the body as
    // error.response.data. When that body is an object, prefer its fields
    // over the generic axios message.
    // NOTE(review): assumes HTTP-level errors use the same
    // error_code/message/user_hint fields as in-band errors — confirm
    // against the R service.
    const body = error?.response?.data;
    const remote = body !== null && typeof body === 'object' ? body : undefined;

    return {
      stepOrder: step.stepOrder,
      toolCode: step.toolCode,
      toolName: step.toolName,
      status: 'error',
      guardrailChecks,
      error: {
        code: remote?.error_code || 'E100',
        message: remote?.message || error.message,
        userHint: remote?.user_hint || '执行过程中发生错误,请重试'
      },
      executionMs
    };
  }
}
/**
 * Asks the R service for just-in-time guardrail checks for a tool.
 *
 * Best-effort: a failed request is logged as a warning, and any outcome
 * other than a 'success' response yields an empty list.
 */
private async runJITGuardrails(
  dataSource: any,
  toolCode: string,
  params: any
): Promise<GuardrailCheck[]> {
  let checks: GuardrailCheck[] = [];
  try {
    const { data: reply } = await this.rClient.post('/api/v1/guardrails/jit', {
      data_source: dataSource,
      tool_code: toolCode,
      params
    });
    if (reply.status === 'success') {
      checks = reply.checks || [];
    }
  } catch (error: any) {
    logger.warn('[SSA:Executor] JIT guardrail check failed', {
      toolCode,
      error: error.message
    });
  }
  return checks;
}
/**
 * Tells whether a tool requires JIT guardrail checks before it runs.
 * Only the two t-tests and the correlation analysis do.
 */
private needsGuardrailCheck(toolCode: string): boolean {
  switch (toolCode) {
    case 'ST_T_TEST_IND':
    case 'ST_T_TEST_PAIRED':
    case 'ST_CORRELATION':
      return true;
    default:
      return false;
  }
}
/**
 * Tells whether a failed step should abort the whole workflow.
 * Only the first step counts as critical; `totalSteps` is accepted but not
 * consulted.
 */
private isCriticalStep(stepOrder: number, totalSteps: number): boolean {
  const firstStepOrder = 1;
  const isFirstStep = stepOrder === firstStepOrder;
  return isFirstStep;
}
/**
 * Builds the data source descriptor sent to the R service.
 *
 * An inline payload wins over an OSS file. For OSS files a URL is requested
 * from storage with an argument of 3600 (presumably the expiry in seconds —
 * confirm against the storage API).
 *
 * @throws Error when the session has neither inline data nor an OSS key
 */
private async prepareDataSource(session: any): Promise<any> {
  if (session.dataPayload) {
    return { type: 'inline', data: session.dataPayload };
  }

  const ossKey = session.dataOssKey;
  if (!ossKey) {
    throw new Error('No data source available');
  }

  const signedUrl = await storage.getUrl(ossKey, 3600);
  return { type: 'oss', oss_url: signedUrl };
}
/**
 * Publishes a progress message on the 'progress' event and records a debug
 * log entry for it.
 */
private emitProgress(message: SSEMessage): void {
  this.emit('progress', message);

  const { type, step, status } = message;
  logger.debug('[SSA:Executor] Progress emitted', { type, step, status });
}
/**
 * Fetches a workflow together with its steps in execution order.
 *
 * @returns The workflow record, or null when no workflow has that ID
 */
async getWorkflowStatus(workflowId: string): Promise<any> {
  const stepsInOrder = { orderBy: { stepOrder: 'asc' as const } };
  return prisma.ssaWorkflow.findUnique({
    where: { id: workflowId },
    include: { steps: stepsInOrder }
  });
}
}
// Singleton export: one shared executor, so 'progress' listeners attach to a single emitter.
export const workflowExecutorService = new WorkflowExecutorService();

View File

@@ -0,0 +1,603 @@
/**
* SSA Workflow Planner Service (Phase 2A)
*
* 路径规划器:LLM 驱动的多工具流程规划
*
* 功能:
* - 理解用户意图 + 数据特征
* - 规划 2-7 步分析流程
* - 选择合适的统计工具组合
*/
import { logger } from '../../../common/logging/index.js';
import { prisma } from '../../../config/database.js';
import { DataProfile, dataProfileService } from './DataProfileService.js';
/**
 * Catalogue of statistical tools the planner can schedule, keyed by tool code.
 *
 * Each entry carries a display name, category, description, the input
 * parameter names (a trailing `?` marks an optional parameter) and the kind
 * of output produced. `prerequisite` and `fallback` appear only on
 * ST_T_TEST_IND: its assumption and the tool to use when it does not hold.
 */
export const AVAILABLE_TOOLS = {
ST_DESCRIPTIVE: {
code: 'ST_DESCRIPTIVE',
name: '描述性统计',
category: 'basic',
description: '数据概况、基线特征表',
inputParams: ['variables', 'group_var?'],
outputType: 'summary'
},
ST_T_TEST_IND: {
code: 'ST_T_TEST_IND',
name: '独立样本T检验',
category: 'parametric',
description: '两组连续变量比较(参数方法)',
inputParams: ['group_var', 'value_var'],
outputType: 'comparison',
prerequisite: '正态分布',
fallback: 'ST_MANN_WHITNEY'
},
ST_MANN_WHITNEY: {
code: 'ST_MANN_WHITNEY',
name: 'Mann-Whitney U检验',
category: 'nonparametric',
description: '两组连续/等级变量比较(非参数方法)',
inputParams: ['group_var', 'value_var'],
outputType: 'comparison'
},
ST_T_TEST_PAIRED: {
code: 'ST_T_TEST_PAIRED',
name: '配对T检验',
category: 'parametric',
description: '配对设计的前后对比',
inputParams: ['before_var', 'after_var'],
outputType: 'comparison'
},
ST_CHI_SQUARE: {
code: 'ST_CHI_SQUARE',
name: '卡方检验',
category: 'categorical',
description: '两个分类变量的独立性检验',
inputParams: ['var1', 'var2'],
outputType: 'association'
},
ST_CORRELATION: {
code: 'ST_CORRELATION',
name: '相关分析',
category: 'correlation',
description: 'Pearson/Spearman相关系数',
inputParams: ['var_x', 'var_y', 'method?'],
outputType: 'correlation'
},
ST_LOGISTIC_BINARY: {
code: 'ST_LOGISTIC_BINARY',
name: '二元Logistic回归',
category: 'regression',
description: '二分类结局的多因素分析',
inputParams: ['outcome_var', 'predictors', 'confounders?'],
outputType: 'regression'
}
} as const;
// Union of all valid tool codes, derived from the catalogue keys.
export type ToolCode = keyof typeof AVAILABLE_TOOLS;
/**
 * One planned step of a workflow.
 */
export interface WorkflowStep {
// Execution position, starting at 1.
stepOrder: number;
toolCode: ToolCode;
// Display name taken from the tool catalogue.
toolName: string;
// Tool-specific parameters; key names follow the tool's inputParams in AVAILABLE_TOOLS.
inputParams: Record<string, any>;
// Human-readable reason the step is part of the plan.
purpose: string;
// stepOrder values of the steps this one depends on.
dependsOn?: number[];
}
/**
 * Workflow plan in its internal form; this is the object persisted as the
 * workflow's `workflowPlan`.
 */
export interface WorkflowPlanInternal {
// The analysis goal (the planner uses the user's query text).
goal: string;
// Human-readable explanation of why these steps were chosen.
reasoning: string;
steps: WorkflowStep[];
// Human-readable duration estimate.
estimatedDuration: string;
}
/**
 * Workflow plan in the shape returned by the API (snake_case, matching the
 * frontend types).
 */
export interface WorkflowPlan {
workflow_id: string;
session_id: string;
title: string;
description: string;
total_steps: number;
steps: Array<{
// Same value as the internal stepOrder.
step_number: number;
tool_code: string;
tool_name: string;
// Same value as the internal step purpose.
description: string;
params: Record<string, unknown>;
depends_on?: number[];
}>;
estimated_time_seconds?: number;
// ISO-8601 timestamp.
created_at: string;
}
/**
 * Result of parsing the user's analysis request.
 */
export interface ParsedIntent {
goal: string;
analysisType: 'comparison' | 'correlation' | 'regression' | 'descriptive' | 'mixed';
variables: {
mentioned?: string[]; // columns the user named in the query
outcome?: string; // outcome variable
predictors?: string[]; // predictor / independent variables
grouping?: string; // grouping variable
continuous?: string[]; // all continuous (numeric) columns
categorical?: string[]; // all categorical columns
};
design?: 'independent' | 'paired' | 'longitudinal';
}
export class WorkflowPlannerService {
/**
 * Plans a multi-step analysis workflow for a user request and stores it.
 *
 * @param sessionId Session ID
 * @param userQuery The user's analysis request
 * @param profile Data profile; when omitted, the profile stored for the session is used if there is one
 * @returns The stored plan in the API/frontend format
 */
async planWorkflow(
  sessionId: string,
  userQuery: string,
  profile?: DataProfile
): Promise<WorkflowPlan> {
  logger.info('[SSA:Planner] Planning workflow', { sessionId, userQuery });

  // Fall back to the stored profile when the caller did not supply one.
  if (!profile) {
    const cachedProfile = await dataProfileService.getCachedProfile(sessionId);
    profile = cachedProfile || undefined;
  }

  // Intent first, then the concrete steps derived from it.
  const intent = this.parseUserIntent(userQuery, profile);
  const steps = this.generateSteps(intent, profile);

  const reasoning = this.generateReasoning(intent, steps);
  const estimatedDuration = this.estimateDuration(steps);
  const internalPlan: WorkflowPlanInternal = {
    goal: intent.goal,
    reasoning,
    steps,
    estimatedDuration
  };

  const workflowId = await this.saveWorkflow(sessionId, internalPlan);

  logger.info('[SSA:Planner] Workflow planned', {
    sessionId,
    stepCount: steps.length,
    tools: steps.map(s => s.toolCode)
  });

  // Translate the internal camelCase steps into the snake_case API shape.
  const apiSteps = steps.map(planned => ({
    step_number: planned.stepOrder,
    tool_code: planned.toolCode,
    tool_name: planned.toolName,
    description: planned.purpose,
    params: planned.inputParams,
    depends_on: planned.dependsOn
  }));

  return {
    workflow_id: workflowId,
    session_id: sessionId,
    title: intent.goal,
    description: internalPlan.reasoning,
    total_steps: steps.length,
    steps: apiSteps,
    estimated_time_seconds: steps.length * 5,
    created_at: new Date().toISOString()
  };
}
/**
 * Derives the analysis intent from the user's query using keyword rules and,
 * when a profile is available, column-name matching.
 *
 * - Analysis type and design come from keywords in the query.
 * - `mentioned` lists each column whose name occurs in the query, once, in
 *   profile column order.
 * - Outcome/predictors come from an "A 对 B 的影响"-style pattern; failing
 *   that, the last mentioned column is taken as the outcome and the others
 *   as predictors.
 * - The grouping variable defaults to the first two-level categorical column.
 *
 * @param userQuery Raw user request
 * @param profile Data profile used for column matching (optional)
 * @returns The parsed intent; `goal` is the unmodified query
 */
private parseUserIntent(userQuery: string, profile?: DataProfile): ParsedIntent {
  const query = userQuery.toLowerCase();

  // Keyword-based intent detection.
  let analysisType: ParsedIntent['analysisType'] = 'descriptive';
  let design: ParsedIntent['design'] = 'independent';
  if (query.includes('比较') || query.includes('差异') || query.includes('不同')) {
    analysisType = 'comparison';
  } else if (query.includes('相关') || query.includes('关系') || query.includes('关联')) {
    analysisType = 'correlation';
  } else if (query.includes('影响') || query.includes('因素') || query.includes('预测') || query.includes('回归')) {
    analysisType = 'regression';
  }
  if (query.includes('前后') || query.includes('配对') || query.includes('变化')) {
    design = 'paired';
  }

  const mentioned: string[] = [];
  const predictors: string[] = [];
  const variables: ParsedIntent['variables'] = {
    mentioned,
    outcome: undefined,
    predictors,
    continuous: [],
    categorical: []
  };

  if (profile) {
    const allColumns = profile.columns.map(c => c.name);
    variables.continuous = profile.columns.filter(c => c.type === 'numeric').map(c => c.name);
    variables.categorical = profile.columns.filter(c => c.type === 'categorical').map(c => c.name);

    // Case-insensitive substring match. An empty column name is skipped
    // because every string "contains" the empty string.
    const occursIn = (text: string, col: string): boolean =>
      col !== '' && text.toLowerCase().includes(col.toLowerCase());
    // Each column is recorded at most once, however many rules match it.
    const addUnique = (target: string[], col: string): void => {
      if (!target.includes(col)) {
        target.push(col);
      }
    };

    for (const col of allColumns) {
      if (occursIn(query, col)) {
        addUnique(mentioned, col);
      }
    }

    // Rule: "A 对 B 的影响" -> B is the outcome, A holds the predictors.
    const influenceMatch = userQuery.match(/(.+?)(?:对|影响|预测)(.+?)(?:的|$)/);
    const correlationMatch = userQuery.match(/(.+?)(?:与|和|跟)(.+?)(?:的相关|的关系|的关联)/);

    if (influenceMatch) {
      const [, predictorPart = '', outcomePart = ''] = influenceMatch;
      // The first column found in the outcome part wins.
      variables.outcome = allColumns.find(col => occursIn(outcomePart, col));
      for (const col of allColumns) {
        // A variable never predicts itself.
        if (col !== variables.outcome && occursIn(predictorPart, col)) {
          addUnique(predictors, col);
        }
      }
    } else if (correlationMatch) {
      const [, firstPart = '', secondPart = ''] = correlationMatch;
      for (const col of allColumns) {
        if (occursIn(firstPart, col) || occursIn(secondPart, col)) {
          // Previously pushed unconditionally, duplicating columns already
          // found by the whole-query scan above.
          addUnique(mentioned, col);
        }
      }
    }

    // No explicit outcome but several mentioned columns: treat the last one
    // as the outcome and the rest as predictors.
    if (mentioned.length >= 2 && !variables.outcome) {
      variables.outcome = mentioned[mentioned.length - 1];
      variables.predictors = mentioned.slice(0, -1);
    }

    // Default grouping variable: the first binary categorical column.
    const binaryCol = profile.columns.find(c => c.type === 'categorical' && c.totalLevels === 2);
    if (binaryCol) {
      variables.grouping = binaryCol.name;
    }

    logger.info('[WorkflowPlanner] Parsed variables from query', {
      mentioned: variables.mentioned,
      outcome: variables.outcome,
      predictors: variables.predictors
    });
  }

  return {
    goal: userQuery,
    analysisType,
    design,
    variables
  };
}
/**
 * Tells whether the named column is categorical according to the profile.
 * The name is matched case-insensitively; without a profile, or when the
 * column is unknown, the answer is false.
 */
private isVariableCategorical(varName: string, profile?: DataProfile): boolean {
  if (!profile) {
    return false;
  }
  const wanted = varName.toLowerCase();
  const column = profile.columns.find(candidate => candidate.name.toLowerCase() === wanted);
  return column?.type === 'categorical';
}
/**
 * Tells whether the named column is a categorical variable with exactly two
 * levels. The name is matched case-insensitively; without a profile, or when
 * the column is unknown, the answer is false.
 */
private isVariableBinary(varName: string, profile?: DataProfile): boolean {
  if (!profile) {
    return false;
  }
  const wanted = varName.toLowerCase();
  const column = profile.columns.find(candidate => candidate.name.toLowerCase() === wanted);
  const isCategorical = column?.type === 'categorical';
  return isCategorical && column.totalLevels === 2;
}
/**
 * Turns a parsed intent into concrete workflow steps.
 *
 * Step 1 is always descriptive statistics. At most one core analysis step
 * follows, chosen from the analysis type and the variable types:
 *  - comparison: paired t-test (paired design) or independent t-test;
 *  - correlation: chi-square (both categorical), correlation (both
 *    continuous), or independent t-test (binary categorical + continuous);
 *  - regression: binary logistic regression for a categorical outcome,
 *    otherwise a correlation as a stand-in until linear regression exists.
 * When the needed variables are missing, only the descriptive step is planned.
 *
 * @param intent Parsed user intent
 * @param profile Data profile used to look up variable types (optional)
 * @returns Ordered steps with stepOrder starting at 1
 */
private generateSteps(intent: ParsedIntent, profile?: DataProfile): WorkflowStep[] {
  const steps: WorkflowStep[] = [];
  let stepOrder = 1;

  const outcomeVar = intent.variables?.outcome;
  // De-duplicate, and never let the outcome act as its own predictor.
  const mentionedVars = [...new Set(intent.variables?.mentioned || [])];
  const predictorVars = [...new Set(intent.variables?.predictors || [])]
    .filter(name => name !== outcomeVar);
  const continuousVars = intent.variables?.continuous || [];
  const groupingVar = intent.variables?.grouping;

  // Step 1: always start with descriptive statistics.
  const descVars = mentionedVars.length > 0
    ? mentionedVars
    : continuousVars.slice(0, 5);
  steps.push({
    stepOrder: stepOrder++,
    toolCode: 'ST_DESCRIPTIVE',
    toolName: AVAILABLE_TOOLS.ST_DESCRIPTIVE.name,
    inputParams: {
      variables: descVars,
      group_var: groupingVar
    },
    purpose: '了解数据的基本特征和分布'
  });

  // Core analysis step, chosen by analysis type and variable types.
  switch (intent.analysisType) {
    case 'comparison': {
      if (intent.design === 'paired') {
        // Paired design: compare the first two continuous variables.
        if (continuousVars.length >= 2) {
          steps.push({
            stepOrder: stepOrder++,
            toolCode: 'ST_T_TEST_PAIRED',
            toolName: AVAILABLE_TOOLS.ST_T_TEST_PAIRED.name,
            inputParams: {
              before_var: continuousVars[0],
              after_var: continuousVars[1]
            },
            purpose: '检验配对样本的均值差异',
            dependsOn: [1]
          });
        }
      } else if (groupingVar && continuousVars.length) {
        // Independent-samples design.
        steps.push({
          stepOrder: stepOrder++,
          toolCode: 'ST_T_TEST_IND',
          toolName: AVAILABLE_TOOLS.ST_T_TEST_IND.name,
          inputParams: {
            group_var: groupingVar,
            value_var: continuousVars[0]
          },
          purpose: '检验两组均值是否存在显著差异正态时用T检验否则自动降级为Mann-Whitney',
          dependsOn: [1]
        });
      }
      break;
    }

    case 'correlation': {
      if (mentionedVars.length >= 2) {
        const var1 = mentionedVars[0];
        const var2 = mentionedVars[1];
        const var1IsCat = this.isVariableCategorical(var1, profile);
        const var2IsCat = this.isVariableCategorical(var2, profile);

        if (var1IsCat && var2IsCat) {
          // Two categorical variables -> chi-square test.
          steps.push({
            stepOrder: stepOrder++,
            toolCode: 'ST_CHI_SQUARE',
            toolName: AVAILABLE_TOOLS.ST_CHI_SQUARE.name,
            inputParams: {
              var1: var1,
              var2: var2
            },
            purpose: `分析 ${var1} 与 ${var2} 两个分类变量的关联性`,
            dependsOn: [1]
          });
        } else if (!var1IsCat && !var2IsCat) {
          // Two continuous variables -> Pearson/Spearman correlation.
          steps.push({
            stepOrder: stepOrder++,
            toolCode: 'ST_CORRELATION',
            toolName: AVAILABLE_TOOLS.ST_CORRELATION.name,
            inputParams: {
              var_x: var1,
              var_y: var2,
              method: 'auto'
            },
            purpose: `分析 ${var1} 与 ${var2} 的相关性`,
            dependsOn: [1]
          });
        } else {
          // One categorical, one continuous -> t-test when the categorical
          // one is binary. A multi-level categorical gets no step here.
          const catVar = var1IsCat ? var1 : var2;
          const contVar = var1IsCat ? var2 : var1;
          if (this.isVariableBinary(catVar, profile)) {
            steps.push({
              stepOrder: stepOrder++,
              toolCode: 'ST_T_TEST_IND',
              toolName: AVAILABLE_TOOLS.ST_T_TEST_IND.name,
              inputParams: {
                group_var: catVar,
                value_var: contVar
              },
              purpose: `比较 ${catVar} 不同组别下 ${contVar} 的差异(点双列相关的等价检验)`,
              dependsOn: [1]
            });
          }
        }
      } else if (continuousVars.length >= 2) {
        // No variables named: default to the first two continuous ones.
        steps.push({
          stepOrder: stepOrder++,
          toolCode: 'ST_CORRELATION',
          toolName: AVAILABLE_TOOLS.ST_CORRELATION.name,
          inputParams: {
            var_x: continuousVars[0],
            var_y: continuousVars[1],
            method: 'auto'
          },
          purpose: '分析两个连续变量的相关性',
          dependsOn: [1]
        });
      }
      break;
    }

    case 'regression': {
      // Use the user's outcome/predictors when given, otherwise default to
      // the grouping variable and the first continuous variables.
      const regressionOutcome = outcomeVar || groupingVar;
      // Default predictors must not contain the outcome, or the analysis
      // would relate a variable to itself.
      const regressionPredictors = predictorVars.length > 0
        ? predictorVars
        : continuousVars.filter(name => name !== regressionOutcome).slice(0, 5);

      if (regressionOutcome && regressionPredictors.length > 0) {
        const outcomeBinary = this.isVariableBinary(regressionOutcome, profile);
        const outcomeCat = this.isVariableCategorical(regressionOutcome, profile);
        logger.info('[WorkflowPlanner] Regression analysis', {
          outcome: regressionOutcome,
          predictors: regressionPredictors,
          outcomeBinary,
          outcomeCat
        });

        if (outcomeBinary || outcomeCat) {
          // Categorical outcome -> logistic regression.
          steps.push({
            stepOrder: stepOrder++,
            toolCode: 'ST_LOGISTIC_BINARY',
            toolName: AVAILABLE_TOOLS.ST_LOGISTIC_BINARY.name,
            inputParams: {
              outcome_var: regressionOutcome,
              predictors: regressionPredictors
            },
            purpose: `分析 ${regressionPredictors.join('、')} 对 ${regressionOutcome} 的影响(二元 Logistic 回归)`,
            dependsOn: [1]
          });
        } else {
          // Continuous outcome: linear regression is not in the tool set
          // yet (TODO), so relate the first predictor to the outcome with
          // a correlation instead.
          logger.warn('[WorkflowPlanner] Linear regression not yet implemented, falling back to correlation analysis');
          steps.push({
            stepOrder: stepOrder++,
            toolCode: 'ST_CORRELATION',
            toolName: AVAILABLE_TOOLS.ST_CORRELATION.name,
            inputParams: {
              var_x: regressionPredictors[0],
              var_y: regressionOutcome,
              method: 'auto'
            },
            purpose: `分析 ${regressionPredictors[0]} 与 ${regressionOutcome} 的相关性(线性回归待开发)`,
            dependsOn: [1]
          });
        }
      }
      // The former trailing "grouping variable as outcome" branch is gone:
      // whenever its condition held, the branch above had already matched.
      break;
    }
  }

  return steps;
}
/**
 * Builds the human-readable explanation of the plan: an intro line, one
 * numbered line per step, and, for comparison analyses, a note about the
 * automatic normality check.
 *
 * @param intent Parsed user intent (goal and analysis type are used)
 * @param steps Planned steps
 * @returns Multi-line explanation text
 */
private generateReasoning(intent: ParsedIntent, steps: WorkflowStep[]): string {
  const reasons: string[] = [];
  reasons.push(`根据您的分析目标「${intent.goal}」,我为您规划了 ${steps.length} 步分析流程:`);

  for (const step of steps) {
    // Tool name and purpose used to be glued together with no separator.
    reasons.push(`${step.stepOrder}. ${step.toolName}:${step.purpose}`);
  }

  if (intent.analysisType === 'comparison') {
    reasons.push('\n说明系统会自动进行正态性检验如不满足正态性假设将自动切换为非参数方法。');
  }

  return reasons.join('\n');
}
/**
 * Estimates how long the workflow will take, assuming 5 seconds per step.
 *
 * @param steps Planned steps
 * @returns Duration text with its unit: seconds below one minute, otherwise
 *          minutes rounded up
 */
private estimateDuration(steps: WorkflowStep[]): string {
  const secondsPerStep = 5;
  const totalSeconds = steps.length * secondsPerStep;

  if (totalSeconds < 60) {
    // The unit was missing here, so callers received a bare number such as
    // "10" next to "2 分钟" from the other branch.
    return `${totalSeconds} 秒`;
  }
  return `${Math.ceil(totalSeconds / 60)} 分钟`;
}
/**
 * Persists a workflow plan together with its step records.
 *
 * Workflow and steps are written with a single nested create, so either
 * everything is stored or nothing is. The earlier per-step loop could leave
 * a workflow with only some of its steps when a later insert failed.
 *
 * @param sessionId Session the workflow belongs to
 * @param plan Internal plan to store
 * @returns ID of the created workflow
 */
private async saveWorkflow(sessionId: string, plan: WorkflowPlanInternal): Promise<string> {
  const workflow = await prisma.ssaWorkflow.create({
    data: {
      sessionId,
      status: 'pending',
      totalSteps: plan.steps.length,
      completedSteps: 0,
      workflowPlan: plan as any,
      reasoning: plan.reasoning,
      // `steps` is the workflow -> step relation (the executor loads it via
      // `include: { steps: ... }`); the foreign key is filled in by Prisma.
      steps: {
        create: plan.steps.map(step => ({
          stepOrder: step.stepOrder,
          toolCode: step.toolCode,
          toolName: step.toolName,
          status: 'pending',
          inputParams: step.inputParams
        }))
      }
    }
  });

  logger.info('[SSA:Planner] Workflow saved', {
    sessionId,
    workflowId: workflow.id,
    stepCount: plan.steps.length
  });

  return workflow.id;
}
}
// Singleton export: one shared WorkflowPlannerService instance for the module's consumers.
export const workflowPlannerService = new WorkflowPlannerService();