feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development
Phase I - Session Blackboard + READ Layer:
- SessionBlackboardService with Postgres-Only cache
- DataProfileService for data overview generation
- PicoInferenceService for LLM-driven PICO extraction
- Frontend DataContextCard and VariableDictionaryPanel
- E2E tests: 31/31 passed

Phase II - Conversation Layer LLM + Intent Router:
- ConversationService with SSE streaming
- IntentRouterService (rule-first + LLM fallback, 6 intents)
- SystemPromptService with 6-segment dynamic assembly
- TokenTruncationService for context management
- ChatHandlerService as unified chat entry
- Frontend SSAChatPane and useSSAChat hook
- E2E tests: 38/38 passed

Phase III - Method Consultation + AskUser Standardization:
- ToolRegistryService with Repository Pattern
- MethodConsultService with DecisionTable + LLM enhancement
- AskUserService with global interrupt handling
- Frontend AskUserCard component
- E2E tests: 13/13 passed

Phase IV - Dialogue-Driven Analysis + QPER Integration:
- ToolOrchestratorService (plan/execute/report)
- analysis_plan SSE event for WorkflowPlan transmission
- Dual-channel confirmation (ask_user card + workspace button)
- PICO as optional hint for LLM parsing
- E2E tests: 25/25 passed

R Statistics Service:
- 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon
- Enhanced guardrails and block helpers
- Comprehensive test suite (run_all_tools_test.js)

Documentation:
- Updated system status document (v5.9)
- Updated SSA module status and development plan (v1.8)

Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
204
backend/src/modules/ssa/services/TokenTruncationService.ts
Normal file
204
backend/src/modules/ssa/services/TokenTruncationService.ts
Normal file
@@ -0,0 +1,204 @@
|
||||
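/**
 * Phase I — Token truncation service
 *
 * Before SessionBlackboard data is injected into the LLM prompt, the payload
 * is trimmed according to a priority strategy so that it fits the model's
 * context window.
 *
 * Trimming strategy (listed from lowest to highest retention priority):
 *   1. Full variable dictionary → keep only variables that are not isIdLike
 *   2. topValues lists → truncate to the top 5
 *   3. Detailed numeric-column statistics → keep mean/std/median, drop skewness/kurtosis
 *   4. normalityTests → keep only the non-normal variables
 *   5. picoInference → always kept (highest priority)
 *   6. fiveSectionReport.content → truncated to the first 500 characters when over the limit
 *
 * Token usage is estimated with a simple heuristic: 1 Chinese character ≈ 2 tokens,
 * 1 English word ≈ 1.3 tokens. In code, the estimate is the combined character
 * length of the formatted sections divided by 2 (rounded up). This is only a
 * rough figure; for predominantly Chinese text it can under-estimate.
 */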
|
||||
|
||||
import { logger } from '../../../common/logging/index.js';
|
||||
import type {
|
||||
SessionBlackboard,
|
||||
DataOverview,
|
||||
VariableDictEntry,
|
||||
FiveSectionReport,
|
||||
} from '../types/session-blackboard.types.js';
|
||||
|
||||
/** Options accepted by TokenTruncationService.truncate(). */
export interface TruncationOptions {
|
||||
/** Token budget for the formatted context; truncate() uses DEFAULT_MAX_TOKENS when omitted. */
maxTokens?: number;
|
||||
/** Formatting strategy; truncate() uses 'balanced' when omitted. */
strategy?: 'aggressive' | 'balanced' | 'minimal';
|
||||
}
|
||||
|
||||
/** Prompt-ready text sections produced by TokenTruncationService.truncate(). */
interface TruncatedContext {
|
||||
/** Data overview text (row/column counts, missing rate, complete cases, optional non-normal list). */
overview: string;
|
||||
/** Variable list, one "- name: type" line per variable. */
variables: string;
|
||||
/** PICO lines; empty string when the blackboard has no picoInference. */
pico: string;
|
||||
/** Data diagnostic summary text. */
report: string;
|
||||
/** Estimated token count: combined character length of the four sections divided by 2, rounded up. */
estimatedTokens: number;
|
||||
}
|
||||
|
||||
// Token budget used by truncate() when options.maxTokens is not provided.
const DEFAULT_MAX_TOKENS = 3000;
|
||||
|
||||
/**
 * Condenses a SessionBlackboard into short text sections (PICO, data overview,
 * variable list, diagnostic summary) intended for injection into an LLM prompt.
 *
 * Token usage is estimated as half of the combined character length of the
 * four sections. When the first formatting pass exceeds the budget, a fixed
 * set of harsher reductions is applied once.
 */
export class TokenTruncationService {
  /** Maximum number of variables listed under the 'aggressive' strategy. */
  private static readonly AGGRESSIVE_VARIABLE_LIMIT = 15;
  /** Maximum number of report characters kept under the 'aggressive' strategy. */
  private static readonly AGGRESSIVE_REPORT_CHARS = 500;
  /** Target number of variables listed by the over-budget fallback. */
  private static readonly FALLBACK_VARIABLE_LIMIT = 10;
  /** Maximum number of report characters kept by the over-budget fallback. */
  private static readonly FALLBACK_REPORT_CHARS = 300;

  /**
   * Truncates a SessionBlackboard into compact text that can be injected into a prompt.
   *
   * @param blackboard - Session state to summarise.
   * @param options - Optional token budget and strategy; defaults to
   *   DEFAULT_MAX_TOKENS and 'balanced'.
   * @returns The formatted sections together with their estimated token count.
   */
  truncate(
    blackboard: SessionBlackboard,
    options: TruncationOptions = {},
  ): TruncatedContext {
    const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
    const strategy = options.strategy ?? 'balanced';

    logger.debug('[SSA:TokenTrunc] Truncating context', {
      sessionId: blackboard.sessionId,
      maxTokens,
      strategy,
    });

    const pico = this.formatPico(blackboard);
    const overview = this.formatOverview(blackboard.dataOverview, strategy);
    const variables = this.formatVariables(blackboard.variableDictionary, strategy);
    const report = this.formatReport(blackboard, strategy);

    let ctx: TruncatedContext = {
      pico,
      overview,
      variables,
      report,
      estimatedTokens: 0,
    };
    ctx.estimatedTokens = this.estimateTokens(ctx);

    // The first pass did not fit: apply the harsher fallback once.
    if (ctx.estimatedTokens > maxTokens) {
      ctx = this.applyAggressiveTruncation(ctx, blackboard, maxTokens);
    }

    logger.debug('[SSA:TokenTrunc] Truncation complete', {
      estimatedTokens: ctx.estimatedTokens,
      maxTokens,
    });

    return ctx;
  }

  /**
   * Builds a single string that can be concatenated directly into a system prompt.
   * Empty sections are skipped; the remaining ones are separated by a blank line.
   *
   * @param ctx - Sections produced by truncate().
   * @returns The sections rendered under their markdown headings.
   */
  toPromptString(ctx: TruncatedContext): string {
    const parts: string[] = [];

    if (ctx.pico) parts.push(`## PICO 结构\n${ctx.pico}`);
    if (ctx.overview) parts.push(`## 数据概览\n${ctx.overview}`);
    if (ctx.variables) parts.push(`## 变量列表\n${ctx.variables}`);
    if (ctx.report) parts.push(`## 数据诊断摘要\n${ctx.report}`);

    return parts.join('\n\n');
  }

  /**
   * Renders the PICO inference as one line per populated element.
   * Returns an empty string when the blackboard carries no picoInference.
   */
  private formatPico(bb: SessionBlackboard): string {
    const p = bb.picoInference;
    if (!p) return '';

    const lines: string[] = [];
    if (p.population) lines.push(`P (人群): ${p.population}`);
    if (p.intervention) lines.push(`I (干预): ${p.intervention}`);
    if (p.comparison) lines.push(`C (对照): ${p.comparison}`);
    if (p.outcome) lines.push(`O (结局): ${p.outcome}`);
    return lines.join('\n');
  }

  /**
   * Renders the dataset dimensions, missing rate and complete-case count.
   * Unless the strategy is 'aggressive', the names of non-normal variables
   * are appended on a second line.
   */
  private formatOverview(ov: DataOverview | null, strategy: string): string {
    if (!ov) return '';

    const s = ov.profile.summary;
    let text = `${s.totalRows} 行 × ${s.totalColumns} 列, 缺失率 ${s.overallMissingRate}%, 完整病例 ${ov.completeCaseCount}`;

    if (strategy !== 'aggressive' && ov.normalityTests?.length) {
      const nonNormal = ov.normalityTests.filter(t => !t.isNormal).map(t => t.variable);
      if (nonNormal.length > 0) {
        text += `\n非正态: ${nonNormal.join(', ')}`;
      }
    }

    return text;
  }

  /**
   * Renders the variable dictionary as "- name: type "label" [role]" lines.
   * ID-like variables are always dropped; under the 'aggressive' strategy only
   * the first AGGRESSIVE_VARIABLE_LIMIT remaining variables are listed.
   */
  private formatVariables(dict: VariableDictEntry[], strategy: string): string {
    let vars = dict.filter(v => !v.isIdLike);

    if (strategy === 'aggressive') {
      vars = vars.slice(0, TokenTruncationService.AGGRESSIVE_VARIABLE_LIMIT);
    }

    return vars
      .map(v => {
        const type = v.confirmedType ?? v.inferredType;
        const label = v.label ? ` "${v.label}"` : '';
        const role = v.picoRole ? ` [${v.picoRole}]` : '';
        return `- ${v.name}: ${type}${label}${role}`;
      })
      .join('\n');
  }

  /**
   * Renders the diagnostic summary derived from the data overview. Under the
   * 'aggressive' strategy the text is cut to AGGRESSIVE_REPORT_CHARS characters
   * and suffixed with '...'.
   */
  private formatReport(bb: SessionBlackboard, strategy: string): string {
    const report = bb.dataOverview ? this.buildReportSummary(bb.dataOverview) : '';
    const cap = TokenTruncationService.AGGRESSIVE_REPORT_CHARS;

    if (strategy === 'aggressive' && report.length > cap) {
      return report.slice(0, cap) + '...';
    }
    return report;
  }

  /**
   * Summarises which columns have missing values, which have outliers, and
   * how many numeric and categorical columns the dataset contains.
   */
  private buildReportSummary(ov: DataOverview): string {
    const s = ov.profile.summary;
    const lines: string[] = [];

    const missingCols = ov.profile.columns.filter(c => c.missingCount > 0);
    if (missingCols.length > 0) {
      lines.push(`缺失变量(${missingCols.length}): ${missingCols.map(c => c.name).join(', ')}`);
    }

    // outlierCount is read through a narrow structural cast instead of `any`;
    // columns without the field count as having no outliers.
    const outlierCols = ov.profile.columns.filter(
      c => ((c as { outlierCount?: number }).outlierCount ?? 0) > 0,
    );
    if (outlierCols.length > 0) {
      lines.push(`异常值变量(${outlierCols.length}): ${outlierCols.map(c => c.name).join(', ')}`);
    }

    const catCount = s.categoricalColumns;
    const numCount = s.numericColumns;
    lines.push(`类型: 数值${numCount} + 分类${catCount}`);

    return lines.join('\n');
  }

  /** Estimates tokens as half of the combined character length of all sections, rounded up. */
  private estimateTokens(ctx: TruncatedContext): number {
    const total = ctx.pico.length + ctx.overview.length + ctx.variables.length + ctx.report.length;
    return Math.ceil(total / 2);
  }

  /**
   * Fallback applied once when the first pass exceeds the budget:
   *  - the report is cut to FALLBACK_REPORT_CHARS characters (plus '...');
   *  - the variable list is rebuilt without labels/roles and, when more than
   *    FALLBACK_VARIABLE_LIMIT non-ID variables exist, reduced to every
   *    PICO-role variable plus as many other variables as still fit under
   *    the limit.
   * The PICO and overview sections are left untouched.
   *
   * @param ctx - Result of the first formatting pass (not mutated).
   * @param bb - Blackboard the variable list is rebuilt from.
   * @param maxTokens - Budget that triggered the fallback. It is not enforced
   *   here: the reductions are fixed and the result may still exceed it.
   * @returns A new context with the reduced sections and a refreshed estimate.
   */
  private applyAggressiveTruncation(
    ctx: TruncatedContext,
    bb: SessionBlackboard,
    maxTokens: number,
  ): TruncatedContext {
    const result = { ...ctx };

    const reportCap = TokenTruncationService.FALLBACK_REPORT_CHARS;
    if (result.report.length > reportCap) {
      result.report = result.report.slice(0, reportCap) + '...';
    }

    const variableCap = TokenTruncationService.FALLBACK_VARIABLE_LIMIT;
    let vars = bb.variableDictionary.filter(v => !v.isIdLike);
    if (vars.length > variableCap) {
      const picoVars = vars.filter(v => v.picoRole);
      // Clamp at zero: a negative slice end would count from the END of the
      // array and keep almost every non-PICO variable instead of none.
      const remainingSlots = Math.max(0, variableCap - picoVars.length);
      const others = vars.filter(v => !v.picoRole).slice(0, remainingSlots);
      vars = [...picoVars, ...others];
    }

    result.variables = vars
      .map(v => `- ${v.name}: ${v.confirmedType ?? v.inferredType}`)
      .join('\n');

    result.estimatedTokens = this.estimateTokens(result);
    return result;
  }
}
|
||||
|
||||
// Module-level TokenTruncationService instance exported for shared use.
export const tokenTruncationService = new TokenTruncationService();
|
||||
Reference in New Issue
Block a user