Files
AIclinicalresearch/backend/src/modules/pkb/services/tokenService.ts
HaHafeng 5a17d096a7 feat(pkb): Complete PKB module frontend migration with V3 design
Summary:
- Implement PKB Dashboard and Workspace pages based on V3 prototype
- Add single-layer header with integrated Tab navigation
- Implement 3 work modes: Full Text, Deep Read, Batch Processing
- Integrate Ant Design X Chat component for AI conversations
- Create BatchModeComplete with template selection and document processing
- Add compact work mode selector with dropdown design

Backend:
- Migrate PKB controllers and services to /modules/pkb structure
- Register v2 API routes at /api/v2/pkb/knowledge
- Maintain dual API routes for backward compatibility

Technical details:
- Use Zustand for state management
- Handle SSE streaming responses for AI chat
- Support document selection for Deep Read mode
- Implement batch processing with progress tracking

Known issues:
- Batch processing API integration pending
- Knowledge assets page navigation needs optimization

Status: Frontend functional, pending refinement
2026-01-06 22:15:42 +08:00

233 lines
5.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { encoding_for_model, Tiktoken } from 'tiktoken';
/**
 * Token counting service.
 * Manages token budgets for the full-text reading mode.
 */

// Token limit configuration.
export const TOKEN_LIMITS = {
  MAX_FILES: 50, // at most 50 files per full-text request
  MAX_TOTAL_TOKENS: 980000, // 980K cap — leaves ~20K headroom within Qwen-Long's 1M context
  CONTEXT_RESERVE: 20000, // tokens reserved for the system prompt and the user query
};
// Module-level cache: the tokenizer is created once and reused.
let encoderCache: Tiktoken | null = null;

/**
 * Return the shared encoder, creating it lazily on first use.
 * Qwen uses a tokenizer similar to GPT-4's, so gpt-4 serves as a stand-in.
 */
function getEncoder(): Tiktoken {
  if (encoderCache === null) {
    encoderCache = encoding_for_model('gpt-4');
  }
  return encoderCache;
}
/**
* 计算文本的Token数
*/
export function countTokens(text: string): number {
if (!text || text.trim().length === 0) {
return 0;
}
try {
const encoder = getEncoder();
const tokens = encoder.encode(text);
return tokens.length;
} catch (error) {
console.error('[TokenService] Failed to count tokens:', error);
// 降级粗略估算中文约1.5字符/token英文约4字符/token
const chineseChars = (text.match(/[\u4e00-\u9fff]/g) || []).length;
const totalChars = text.length;
const englishChars = totalChars - chineseChars;
return Math.ceil(chineseChars / 1.5 + englishChars / 4);
}
}
/**
 * Count tokens for several texts at once.
 * Returns one count per input, in the same order.
 */
export function countTokensBatch(texts: string[]): number[] {
  const counts: number[] = [];
  for (const item of texts) {
    counts.push(countTokens(item));
  }
  return counts;
}
/**
 * Token accounting for a single document (based on its extracted text).
 */
export interface DocumentTokenInfo {
  documentId: string;
  filename: string;
  charCount: number; // character count of the extracted text; 0 when unknown
  estimatedTokens: number; // exact when text was available, char-based estimate otherwise
  extractionMethod?: string; // how the text was extracted — set by the upstream pipeline
}
/**
 * Compute token info for a list of documents.
 *
 * When a document's extracted text is available, the token count is
 * computed precisely; otherwise it is estimated from the character
 * count, assuming ~2.5 chars/token for mixed Chinese/English text.
 * Documents with neither get an estimate of 0.
 */
export function calculateDocumentTokens(
  documents: Array<{
    id: string;
    filename: string;
    extractedText?: string | null;
    charCount?: number | null;
    extractionMethod?: string | null;
  }>
): DocumentTokenInfo[] {
  const infos: DocumentTokenInfo[] = [];
  for (const doc of documents) {
    let estimatedTokens = 0;
    if (doc.extractedText) {
      // Precise count from the extracted text.
      estimatedTokens = countTokens(doc.extractedText);
    } else if (doc.charCount) {
      // No text available — estimate from the character count.
      estimatedTokens = Math.ceil(doc.charCount / 2.5);
    }
    infos.push({
      documentId: doc.id,
      filename: doc.filename,
      charCount: doc.charCount || 0,
      estimatedTokens,
      extractionMethod: doc.extractionMethod || undefined,
    });
  }
  return infos;
}
/**
 * Result of selecting documents to fit within the token limit.
 * Strategy: prefer documents with fewer tokens until a limit is reached.
 */
export interface DocumentSelectionResult {
  selectedDocuments: DocumentTokenInfo[];
  totalTokens: number; // combined estimated tokens of the selected documents
  totalFiles: number; // number of selected documents
  excludedDocuments: DocumentTokenInfo[]; // documents that did not fit under the limits
  reason: 'all_included' | 'file_limit' | 'token_limit'; // which limit (if any) caused exclusions
  availableTokens: number; // remaining token budget after selection
}
/**
 * Pick documents for full-text mode under file-count and token budgets.
 *
 * Greedy strategy: take the smallest documents first (by estimated
 * tokens) so as many documents as possible fit, stopping additions
 * once either the file cap or the token budget would be exceeded.
 *
 * @param documents Candidates with precomputed token estimates.
 * @param maxFiles  Upper bound on the number of selected documents.
 * @param maxTokens Upper bound on the combined token count.
 * @returns The selection, the exclusions, and the limiting reason.
 */
export function selectDocumentsForFullText(
  documents: DocumentTokenInfo[],
  maxFiles: number = TOKEN_LIMITS.MAX_FILES,
  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
): DocumentSelectionResult {
  // Sort a copy ascending by token count — smallest-first maximizes inclusions.
  const byTokensAsc = [...documents].sort(
    (a, b) => a.estimatedTokens - b.estimatedTokens
  );

  const selectedDocuments: DocumentTokenInfo[] = [];
  const excludedDocuments: DocumentTokenInfo[] = [];
  let runningTotal = 0;

  for (const candidate of byTokensAsc) {
    const overFileCap = selectedDocuments.length >= maxFiles;
    const overTokenCap = runningTotal + candidate.estimatedTokens > maxTokens;
    if (overFileCap || overTokenCap) {
      excludedDocuments.push(candidate);
    } else {
      selectedDocuments.push(candidate);
      runningTotal += candidate.estimatedTokens;
    }
  }

  // Explain why anything was left out; the file cap takes precedence.
  let reason: 'all_included' | 'file_limit' | 'token_limit' = 'all_included';
  if (excludedDocuments.length > 0) {
    reason =
      selectedDocuments.length >= maxFiles ? 'file_limit' : 'token_limit';
  }

  return {
    selectedDocuments,
    totalTokens: runningTotal,
    totalFiles: selectedDocuments.length,
    excludedDocuments,
    reason,
    availableTokens: maxTokens - runningTotal,
  };
}
/**
 * Estimate the tokens a query will consume: the query itself, the
 * optional system prompt, plus a flat 2000-token reserve for the
 * model's response.
 */
export function estimateQueryTokens(query: string, systemPrompt?: string): number {
  const queryTokens = countTokens(query);
  const promptTokens = systemPrompt ? countTokens(systemPrompt) : 0;
  return queryTokens + promptTokens + 2000; // reserve room for the response
}
/**
 * Check whether the combined document + query tokens fit the limit.
 *
 * @param documentsTokens Tokens consumed by the selected documents.
 * @param queryTokens     Tokens consumed by the query (and prompt).
 * @param maxTokens       Budget to check against.
 * @returns Verdict plus the totals; `remaining` is negative when over.
 */
export function checkTokenLimit(
  documentsTokens: number,
  queryTokens: number,
  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
): {
  withinLimit: boolean;
  totalTokens: number;
  maxTokens: number;
  remaining: number;
} {
  const totalTokens = documentsTokens + queryTokens;
  const remaining = maxTokens - totalTokens;
  return { withinLimit: remaining >= 0, totalTokens, maxTokens, remaining };
}
/**
 * Release the cached encoder and its underlying resources.
 * Safe to call multiple times; a no-op when nothing is cached.
 */
export function cleanup() {
  if (encoderCache === null) {
    return;
  }
  encoderCache.free();
  encoderCache = null;
}
// Release the cached encoder when the process terminates. The explicit
// free matters because tiktoken's .free() presumably releases native/WASM
// memory outside the JS heap — TODO confirm against the tiktoken docs.
if (typeof process !== 'undefined') {
  process.on('exit', cleanup);
  process.on('SIGINT', () => {
    cleanup();
    // 128 + SIGINT(2) = 130, the POSIX convention for "killed by SIGINT".
    // The previous bare process.exit() reported code 0, which made shells
    // and orchestrators treat a Ctrl-C shutdown as a successful run.
    process.exit(130);
  });
}