// AIclinicalresearch/backend/src/modules/ssa/services/TokenTruncationService.ts

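/**
 * Phase I — Token truncation service
 *
 * Before SessionBlackboard data is injected into an LLM prompt, the payload is
 * trimmed according to a priority policy so that it fits the model's context window.
 *
 * Trimming policy (listed from lowest to highest retention priority):
 *   1. Full variable dictionary → keep only variables that are not isIdLike
 *   2. topValues lists → truncate to the top 5
 *   3. Detailed numeric-column statistics → keep mean/std/median, drop skewness/kurtosis
 *   4. normalityTests → keep only the non-normal variables
 *   5. picoInference → always kept (highest priority)
 *   6. fiveSectionReport.content → truncate to the first 500 characters when over the limit
 *
 * Note: items 2 and 3 describe data that the formatters in this file do not emit
 * at all (no topValues and no per-column statistics are written to the prompt),
 * and the report section is a summary built from dataOverview rather than
 * fiveSectionReport.content.
 *
 * Token estimation is a simple heuristic: 1 Chinese character ≈ 2 tokens,
 * 1 English word ≈ 1.3 tokens. The implementation approximates the count as
 * (combined length of the formatted text sections) / 2. By the ratio above this
 * under-counts Chinese-heavy text, so treat it as a rough estimate rather than
 * an upper bound.
 */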

import { logger } from '../../../common/logging/index.js';
import type {
  SessionBlackboard,
  DataOverview,
  VariableDictEntry,
  FiveSectionReport,
} from '../types/session-blackboard.types.js';

/** Caller-supplied tuning for TokenTruncationService.truncate(). */
export interface TruncationOptions {
  /** Token budget for the assembled context; truncate() uses DEFAULT_MAX_TOKENS (3000) when omitted. */
  maxTokens?: number;
  /**
   * Formatting preset; truncate() uses 'balanced' when omitted.
   * 'aggressive' is the only value TokenTruncationService branches on; it omits the
   * non-normal variable line, lists at most 15 variables and clips the report to
   * 500 characters. 'minimal' is therefore formatted exactly like 'balanced'.
   */
  strategy?: 'aggressive' | 'balanced' | 'minimal';
}

/**
 * Text sections produced by TokenTruncationService.truncate().
 * toPromptString() skips any section whose string is empty.
 */
interface TruncatedContext {
  /** Dataset summary line, optionally followed by the non-normal variable list; '' when there is no dataOverview. */
  overview: string;
  /** One "- name: type" line per variable that is not isIdLike. */
  variables: string;
  /** P/I/C/O lines taken from picoInference; '' when picoInference is absent. */
  pico: string;
  /** Summary of missing-value columns, outlier columns and column-type counts; '' when there is no dataOverview. */
  report: string;
  /** Heuristic token count: combined length of the four sections above, divided by 2 and rounded up. */
  estimatedTokens: number;
}

/** Token budget used by truncate() when TruncationOptions.maxTokens is not provided. */
const DEFAULT_MAX_TOKENS = 3000;

/**
 * Formats a SessionBlackboard into four short text sections (PICO, data
 * overview, variable list, diagnostic report) and shrinks them when their
 * estimated token count exceeds the caller's budget.
 *
 * The class keeps no instance state; every method derives its output from
 * its arguments only.
 */
export class TokenTruncationService {
  /** Variables listed at most when the caller selects the 'aggressive' strategy. */
  private static readonly AGGRESSIVE_VARIABLE_LIMIT = 15;
  /** Report length (UTF-16 units) kept under the 'aggressive' strategy. */
  private static readonly AGGRESSIVE_REPORT_CHARS = 500;
  /** Variables targeted by the over-budget fallback (PICO-role variables may exceed it). */
  private static readonly FALLBACK_VARIABLE_LIMIT = 10;
  /** Report length (UTF-16 units) kept by the over-budget fallback. */
  private static readonly FALLBACK_REPORT_CHARS = 300;

  /**
   * Build the compact, prompt-ready context for a session.
   *
   * The four sections are first formatted according to `options.strategy`.
   * If their estimated size is above `options.maxTokens`, a fallback pass
   * shortens the report and the variable list (see applyAggressiveTruncation).
   * The PICO and overview sections are never shortened by the fallback, so the
   * result can still exceed the budget when those two alone are too large.
   *
   * @param blackboard Session data to summarise.
   * @param options    Budget and strategy; defaults are DEFAULT_MAX_TOKENS and 'balanced'.
   * @returns The formatted sections plus their estimated token count.
   */
  truncate(
    blackboard: SessionBlackboard,
    options: TruncationOptions = {},
  ): TruncatedContext {
    const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
    const strategy = options.strategy ?? 'balanced';

    logger.debug('[SSA:TokenTrunc] Truncating context', {
      sessionId: blackboard.sessionId,
      maxTokens,
      strategy,
    });

    let ctx: TruncatedContext = {
      pico: this.formatPico(blackboard),
      overview: this.formatOverview(blackboard.dataOverview, strategy),
      variables: this.formatVariables(blackboard.variableDictionary, strategy),
      report: this.formatReport(blackboard, strategy),
      estimatedTokens: 0,
    };
    ctx.estimatedTokens = this.estimateTokens(ctx);

    if (ctx.estimatedTokens > maxTokens) {
      ctx = this.applyAggressiveTruncation(ctx, blackboard, maxTokens);
    }

    logger.debug('[SSA:TokenTrunc] Truncation complete', {
      estimatedTokens: ctx.estimatedTokens,
      maxTokens,
    });

    return ctx;
  }

  /**
   * Join the non-empty sections of a context into one string, each under its
   * own markdown heading and separated by a blank line.
   *
   * @param ctx Context returned by truncate().
   * @returns Text that can be concatenated into a system prompt; '' if every section is empty.
   */
  toPromptString(ctx: TruncatedContext): string {
    const parts: string[] = [];

    if (ctx.pico) parts.push(`## PICO 结构\n${ctx.pico}`);
    if (ctx.overview) parts.push(`## 数据概览\n${ctx.overview}`);
    if (ctx.variables) parts.push(`## 变量列表\n${ctx.variables}`);
    if (ctx.report) parts.push(`## 数据诊断摘要\n${ctx.report}`);

    return parts.join('\n\n');
  }

  /** Render the populated P/I/C/O fields of picoInference, one per line; '' when there is no inference. */
  private formatPico(bb: SessionBlackboard): string {
    const p = bb.picoInference;
    if (!p) return '';
    const lines: string[] = [];
    if (p.population) lines.push(`P (人群): ${p.population}`);
    if (p.intervention) lines.push(`I (干预): ${p.intervention}`);
    if (p.comparison) lines.push(`C (对照): ${p.comparison}`);
    if (p.outcome) lines.push(`O (结局): ${p.outcome}`);
    return lines.join('\n');
  }

  /**
   * Render the dataset summary line. Unless the strategy is 'aggressive', a
   * second line names the variables whose normality test reports isNormal = false.
   */
  private formatOverview(ov: DataOverview | null, strategy: string): string {
    if (!ov) return '';
    const s = ov.profile.summary;
    let text = `${s.totalRows} 行 × ${s.totalColumns} 列, 缺失率 ${s.overallMissingRate}%, 完整病例 ${ov.completeCaseCount}`;

    if (strategy !== 'aggressive' && ov.normalityTests?.length) {
      const nonNormal = ov.normalityTests.filter(t => !t.isNormal).map(t => t.variable);
      if (nonNormal.length > 0) {
        text += `\n非正态: ${nonNormal.join(', ')}`;
      }
    }

    return text;
  }

  /**
   * Render one line per variable that is not isIdLike, in dictionary order:
   * name, confirmed (else inferred) type, optional quoted label, optional PICO role.
   * Under the 'aggressive' strategy only the first 15 such variables are listed.
   */
  private formatVariables(dict: VariableDictEntry[], strategy: string): string {
    let vars = dict.filter(v => !v.isIdLike);

    if (strategy === 'aggressive') {
      vars = vars.slice(0, TokenTruncationService.AGGRESSIVE_VARIABLE_LIMIT);
    }

    return vars.map(v => {
      const type = v.confirmedType ?? v.inferredType;
      const label = v.label ? ` "${v.label}"` : '';
      const role = v.picoRole ? ` [${v.picoRole}]` : '';
      return `- ${v.name}: ${type}${label}${role}`;
    }).join('\n');
  }

  /** Build the diagnostic report from dataOverview; clipped to 500 characters under 'aggressive'. */
  private formatReport(bb: SessionBlackboard, strategy: string): string {
    const report = bb.dataOverview
      ? this.buildReportSummary(bb.dataOverview)
      : '';

    return strategy === 'aggressive'
      ? this.clipText(report, TokenTruncationService.AGGRESSIVE_REPORT_CHARS)
      : report;
  }

  /**
   * Summarise the column profile: columns with missing values, columns with
   * outliers, and the numeric/categorical column counts (always present).
   */
  private buildReportSummary(ov: DataOverview): string {
    const s = ov.profile.summary;
    const lines: string[] = [];

    const missingCols = ov.profile.columns.filter(c => c.missingCount > 0);
    if (missingCols.length > 0) {
      lines.push(`缺失变量(${missingCols.length}): ${missingCols.map(c => c.name).join(', ')}`);
    }

    // outlierCount is read through a narrow structural assertion instead of `any`;
    // a column without the field counts as having no outliers.
    const outlierCols = ov.profile.columns.filter(
      c => ((c as { outlierCount?: number }).outlierCount ?? 0) > 0,
    );
    if (outlierCols.length > 0) {
      lines.push(`异常值变量(${outlierCols.length}): ${outlierCols.map(c => c.name).join(', ')}`);
    }

    const catCount = s.categoricalColumns;
    const numCount = s.numericColumns;
    lines.push(`类型: 数值${numCount} + 分类${catCount}`);

    return lines.join('\n');
  }

  /**
   * Heuristic token count: combined UTF-16 length of the four sections,
   * divided by 2 and rounded up.
   * NOTE(review): this is 0.5 token per character, which is lower than the
   * "1 Chinese character ≈ 2 tokens" ratio quoted in the file header — confirm
   * which ratio is intended before relying on this as a hard limit.
   */
  private estimateTokens(ctx: TruncatedContext): number {
    const total = ctx.pico.length + ctx.overview.length + ctx.variables.length + ctx.report.length;
    return Math.ceil(total / 2);
  }

  /**
   * Return `text` unchanged when it fits in `limit` UTF-16 units, otherwise its
   * first `limit` units followed by '...'. If the cut would land between the two
   * halves of a surrogate pair, the dangling high surrogate is dropped as well so
   * the result never contains a lone surrogate.
   */
  private clipText(text: string, limit: number): string {
    if (text.length <= limit) return text;
    const lastKept = text.charCodeAt(limit - 1);
    const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff;
    return text.slice(0, splitsPair ? limit - 1 : limit) + '...';
  }

  /**
   * Fallback pass used when the first formatting pass is over budget.
   *
   * 1. The report is clipped to 300 characters.
   * 2. The variable list is rebuilt in a terse "- name: type" form. When more
   *    than 10 variables remain after dropping isIdLike ones, every PICO-role
   *    variable is kept (listed first) and the remaining slots, if any, are
   *    filled with the first non-PICO variables.
   * 3. While the estimate is still above `maxTokens`, non-PICO variables are
   *    removed from the end of the list one at a time.
   *
   * PICO-role variables, the PICO section and the overview are never removed,
   * so the returned context may still exceed `maxTokens`.
   *
   * @param ctx       Context from the first pass; not mutated.
   * @param bb        Blackboard the variable list is rebuilt from.
   * @param maxTokens Budget the fallback tries to reach.
   * @returns A new context with shortened report/variables and a refreshed estimate.
   */
  private applyAggressiveTruncation(
    ctx: TruncatedContext,
    bb: SessionBlackboard,
    maxTokens: number,
  ): TruncatedContext {
    const limit = TokenTruncationService.FALLBACK_VARIABLE_LIMIT;
    const result = { ...ctx };

    result.report = this.clipText(result.report, TokenTruncationService.FALLBACK_REPORT_CHARS);

    let vars = bb.variableDictionary.filter(v => !v.isIdLike);
    if (vars.length > limit) {
      const picoVars = vars.filter(v => v.picoRole);
      // Clamp at 0: a negative end index would make slice() count from the end
      // and keep nearly all non-PICO variables instead of none.
      const freeSlots = Math.max(0, limit - picoVars.length);
      const others = vars.filter(v => !v.picoRole).slice(0, freeSlots);
      vars = [...picoVars, ...others];
    }

    const renderVars = (): string =>
      vars.map(v => `- ${v.name}: ${v.confirmedType ?? v.inferredType}`).join('\n');

    result.variables = renderVars();
    result.estimatedTokens = this.estimateTokens(result);

    // Honour the budget as far as the lowest-priority content allows.
    while (result.estimatedTokens > maxTokens) {
      const lastOptional = vars.map(v => Boolean(v.picoRole)).lastIndexOf(false);
      if (lastOptional === -1) break; // only PICO-role variables left
      vars = vars.filter((_, i) => i !== lastOptional);
      result.variables = renderVars();
      result.estimatedTokens = this.estimateTokens(result);
    }

    return result;
  }
}

/** Ready-made module-level TokenTruncationService instance for importers that do not need their own. */
export const tokenTruncationService = new TokenTruncationService();