import { encoding_for_model, Tiktoken } from 'tiktoken';

/**
 * Token counting service.
 *
 * Used for token budgeting in full-text reading mode.
 */

/** Limits applied when assembling documents for full-text reading. */
export const TOKEN_LIMITS = {
  /** Maximum number of files in one selection. */
  MAX_FILES: 50,
  /** Maximum total tokens (980K, leaving 20K headroom in Qwen-Long's 1M context). */
  MAX_TOTAL_TOKENS: 980000,
  /** Tokens reserved for the system prompt and the user query. */
  CONTEXT_RESERVE: 20000,
};

/** Fallback heuristic: roughly 1.5 Chinese characters per token. */
const CHINESE_CHARS_PER_TOKEN = 1.5;
/** Fallback heuristic: roughly 4 non-Chinese characters per token. */
const OTHER_CHARS_PER_TOKEN = 4;
/** Heuristic for mixed Chinese/English text when only a character count is known. */
const MIXED_CHARS_PER_TOKEN = 2.5;
/** Default number of tokens set aside for the model's response. */
const DEFAULT_RESPONSE_RESERVE = 2000;
/** Conventional exit code for termination by SIGINT (128 + signal number 2). */
const SIGINT_EXIT_CODE = 130;

/** Lazily created encoder, shared by all calls until `cleanup()` releases it. */
let encoderCache: Tiktoken | null = null;

/**
 * Returns the shared encoder, creating it on first use.
 *
 * The gpt-4 encoding is used as a stand-in for Qwen's tokenizer, so counts
 * are an approximation of what the Qwen model will see.
 */
function getEncoder(): Tiktoken {
  if (!encoderCache) {
    encoderCache = encoding_for_model('gpt-4');
  }
  return encoderCache;
}

/**
 * Rough character-based token estimate, used when the encoder fails.
 *
 * Characters in the U+4E00..U+9FFF range are counted as Chinese; everything
 * else is treated as "other" text.
 */
function estimateTokensFromChars(text: string): number {
  const chineseChars = (text.match(/[\u4e00-\u9fff]/g) || []).length;
  const otherChars = text.length - chineseChars;
  return Math.ceil(
    chineseChars / CHINESE_CHARS_PER_TOKEN + otherChars / OTHER_CHARS_PER_TOKEN
  );
}

/**
 * Counts the tokens in a piece of text.
 *
 * @param text - Text to measure.
 * @returns 0 for empty or whitespace-only text; otherwise the encoder's token
 *   count, or a character-based estimate if the encoder throws.
 */
export function countTokens(text: string): number {
  if (!text || text.trim().length === 0) {
    return 0;
  }

  try {
    return getEncoder().encode(text).length;
  } catch (error) {
    // Best effort: log the failure and degrade to a heuristic instead of throwing.
    console.error('[TokenService] Failed to count tokens:', error);
    return estimateTokensFromChars(text);
  }
}

/**
 * Counts tokens for several texts.
 *
 * @returns One count per input, in the same order.
 */
export function countTokensBatch(texts: string[]): number[] {
  return texts.map(text => countTokens(text));
}

/** Token statistics for a single document. */
export interface DocumentTokenInfo {
  documentId: string;
  filename: string;
  charCount: number;
  estimatedTokens: number;
  extractionMethod?: string;
}

/**
 * Computes token statistics for a list of documents.
 *
 * When extracted text is available it is tokenized directly. Otherwise the
 * token count is estimated from `charCount` at 2.5 characters per token; a
 * document with neither gets 0 tokens.
 *
 * The reported `charCount` is the document's own value when present; if it is
 * missing, the length of the extracted text is used (0 when there is no text).
 */
export function calculateDocumentTokens(
  documents: Array<{
    id: string;
    filename: string;
    extractedText?: string | null;
    charCount?: number | null;
    extractionMethod?: string | null;
  }>
): DocumentTokenInfo[] {
  return documents.map(doc => {
    let estimatedTokens = 0;
    if (doc.extractedText) {
      estimatedTokens = countTokens(doc.extractedText);
    } else if (doc.charCount) {
      estimatedTokens = Math.ceil(doc.charCount / MIXED_CHARS_PER_TOKEN);
    }

    return {
      documentId: doc.id,
      filename: doc.filename,
      // Do not report 0 characters for a document whose text we actually hold.
      charCount: doc.charCount ?? doc.extractedText?.length ?? 0,
      estimatedTokens,
      extractionMethod: doc.extractionMethod || undefined,
    };
  });
}

/** Outcome of fitting documents into the file-count and token limits. */
export interface DocumentSelectionResult {
  selectedDocuments: DocumentTokenInfo[];
  totalTokens: number;
  totalFiles: number;
  excludedDocuments: DocumentTokenInfo[];
  reason: 'all_included' | 'file_limit' | 'token_limit';
  availableTokens: number;
}

/**
 * Selects documents so that both the file-count and token limits are respected.
 *
 * Strategy: documents are considered in ascending order of token count
 * (smallest first); each one is taken if it fits, otherwise it is excluded.
 * The input array is not mutated.
 *
 * @param documents - Candidate documents with their token estimates.
 * @param maxFiles - Maximum number of documents to select.
 * @param maxTokens - Maximum total tokens across selected documents.
 * @returns The selected and excluded documents, totals, the remaining token
 *   budget, and which limit (if any) caused exclusions.
 */
export function selectDocumentsForFullText(
  documents: DocumentTokenInfo[],
  maxFiles: number = TOKEN_LIMITS.MAX_FILES,
  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
): DocumentSelectionResult {
  const sortedDocs = [...documents].sort(
    (a, b) => a.estimatedTokens - b.estimatedTokens
  );

  const selected: DocumentTokenInfo[] = [];
  const excluded: DocumentTokenInfo[] = [];
  let totalTokens = 0;

  for (const doc of sortedDocs) {
    const fileLimitReached = selected.length >= maxFiles;
    const exceedsTokenBudget = totalTokens + doc.estimatedTokens > maxTokens;

    if (fileLimitReached || exceedsTokenBudget) {
      excluded.push(doc);
      continue;
    }

    selected.push(doc);
    totalTokens += doc.estimatedTokens;
  }

  // If anything was left out, attribute it to the file limit when the
  // selection is full, otherwise to the token budget.
  let reason: DocumentSelectionResult['reason'] = 'all_included';
  if (excluded.length > 0) {
    reason = selected.length >= maxFiles ? 'file_limit' : 'token_limit';
  }

  return {
    selectedDocuments: selected,
    totalTokens,
    totalFiles: selected.length,
    excludedDocuments: excluded,
    reason,
    availableTokens: maxTokens - totalTokens,
  };
}

/**
 * Estimates the tokens a query consumes beyond the documents themselves.
 *
 * @param query - The user query.
 * @param systemPrompt - Optional system prompt, counted when provided.
 * @param responseReserve - Tokens set aside for the model's response
 *   (defaults to 2000).
 * @returns Query tokens + system prompt tokens + the response reserve.
 */
export function estimateQueryTokens(
  query: string,
  systemPrompt?: string,
  responseReserve: number = DEFAULT_RESPONSE_RESERVE
): number {
  let total = countTokens(query);

  if (systemPrompt) {
    total += countTokens(systemPrompt);
  }

  return total + responseReserve;
}

/**
 * Checks whether documents plus query fit within a token budget.
 *
 * NOTE(review): the default budget is `TOKEN_LIMITS.MAX_TOTAL_TOKENS`, the same
 * default `selectDocumentsForFullText` fills with documents alone, so a fully
 * packed selection plus any query will report `withinLimit: false` unless the
 * caller passes a larger `maxTokens` — confirm this is intended.
 *
 * @param documentsTokens - Total tokens of the selected documents.
 * @param queryTokens - Tokens needed for the query (see `estimateQueryTokens`).
 * @param maxTokens - Budget to check against.
 * @returns The combined total, the budget, the remaining tokens (negative when
 *   over budget), and whether the total fits.
 */
export function checkTokenLimit(
  documentsTokens: number,
  queryTokens: number,
  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
): {
  withinLimit: boolean;
  totalTokens: number;
  maxTokens: number;
  remaining: number;
} {
  const totalTokens = documentsTokens + queryTokens;
  const remaining = maxTokens - totalTokens;

  return {
    withinLimit: remaining >= 0,
    totalTokens,
    maxTokens,
    remaining,
  };
}

/**
 * Frees the cached encoder, if any.
 *
 * Safe to call repeatedly; a later `countTokens` call creates a new encoder.
 */
export function cleanup(): void {
  if (encoderCache) {
    encoderCache.free();
    encoderCache = null;
  }
}

// Release the encoder when the process ends.
if (typeof process !== 'undefined') {
  process.on('exit', cleanup);
  process.on('SIGINT', () => {
    cleanup();
    // Registering a SIGINT listener disables Node's default handling, so exit
    // explicitly with the conventional 128 + SIGINT code rather than 0, which
    // would make an interrupted run look like a successful one.
    process.exit(SIGINT_EXIT_CODE);
  });
}