refactor(backend): incremental architecture evolution (Task 19)

- Add common/ layer for shared capabilities (LLM, RAG, document, middleware)
- Add legacy/ layer for existing business code
- Move files to new structure (controllers, routes, services)
- Update index.ts for new route registration
- System remains fully functional
2025-11-16 15:42:44 +08:00
parent 8a17dc80ae
commit 0c5310fb77
39 changed files with 3904 additions and 353 deletions
--- a/backend/src/legacy/services/tokenService.ts
+++ b/backend/src/legacy/services/tokenService.ts
@@ -0,0 +1,232 @@
+import { encoding_for_model, Tiktoken } from 'tiktoken';
+
+/**
+ * Token计数服务
+ * 用于全文阅读模式的Token管理
+ */
+
+// Token limit configuration (default limits for selectDocumentsForFullText and checkTokenLimit)
+export const TOKEN_LIMITS = {
+  MAX_FILES: 50,              // at most 50 files
+  MAX_TOTAL_TOKENS: 980000,   // at most 980K tokens (leaves a 20K margin within Qwen-Long's 1M context)
+  CONTEXT_RESERVE: 20000,     // tokens reserved for the system prompt and the user query
+};
+
+// Lazily created encoder instance, reused across calls until cleanup() frees it.
+let encoderCache: Tiktoken | null = null;
+
+/**
+ * Returns the shared encoder, creating it on first use.
+ * The gpt-4 encoding is used as a stand-in for Qwen's tokenizer.
+ */
+function getEncoder(): Tiktoken {
+  if (encoderCache) {
+    return encoderCache;
+  }
+
+  // Qwen is treated here as using a GPT-4-like tokenizer.
+  const created = encoding_for_model('gpt-4');
+  encoderCache = created;
+  return created;
+}
+
+/**
+ * Counts the tokens in a piece of text.
+ *
+ * Empty or whitespace-only input yields 0. If encoding throws, the error is
+ * logged and a rough character-based estimate is returned instead.
+ */
+export function countTokens(text: string): number {
+  const isBlank = !text || text.trim().length === 0;
+  if (isBlank) {
+    return 0;
+  }
+
+  try {
+    return getEncoder().encode(text).length;
+  } catch (error) {
+    console.error('[TokenService] Failed to count tokens:', error);
+
+    // Fallback: rough estimate (~1.5 chars/token for Chinese, ~4 chars/token for everything else).
+    const cjkMatches = text.match(/[\u4e00-\u9fff]/g);
+    const cjkCount = cjkMatches ? cjkMatches.length : 0;
+    const otherCount = text.length - cjkCount;
+
+    return Math.ceil(cjkCount / 1.5 + otherCount / 4);
+  }
+}
+
+/**
+ * Counts tokens for every text in the list.
+ * The returned counts are in the same order as the inputs.
+ */
+export function countTokensBatch(texts: string[]): number[] {
+  const countOne = (entry: string): number => countTokens(entry);
+  return texts.map(countOne);
+}
+
+/**
+ * Token information for a single document, as produced by
+ * calculateDocumentTokens (based on the document's extracted text
+ * or, failing that, its character count).
+ */
+export interface DocumentTokenInfo {
+  documentId: string; // id of the source document
+  filename: string;
+  charCount: number; // character count reported for the document
+  estimatedTokens: number; // counted from the extracted text, or estimated from the character count
+  extractionMethod?: string;
+}
+
+/**
+ * Computes token information for a list of documents.
+ *
+ * For each document:
+ * - if extracted text is present, tokens are counted from it via countTokens;
+ * - otherwise, if a character count is present, tokens are estimated assuming
+ *   an average of 2.5 characters per token (mixed Chinese/English text);
+ * - otherwise the estimate is 0.
+ *
+ * The reported charCount is the provided charCount when it is set. When it is
+ * missing (or 0) but extracted text exists, the length of that text is used
+ * instead, so a document with content is never reported as having zero
+ * characters alongside a non-zero token count.
+ *
+ * @param documents Documents with optional extracted text and character count.
+ * @returns One DocumentTokenInfo per input document, in input order.
+ */
+export function calculateDocumentTokens(
+  documents: Array<{
+    id: string;
+    filename: string;
+    extractedText?: string | null;
+    charCount?: number | null;
+    extractionMethod?: string | null;
+  }>
+): DocumentTokenInfo[] {
+  return documents.map(doc => {
+    const text = doc.extractedText;
+    const storedCharCount = doc.charCount || 0;
+
+    let estimatedTokens = 0;
+    if (text) {
+      // Extracted text is available: count its tokens directly.
+      estimatedTokens = countTokens(text);
+    } else if (storedCharCount) {
+      // No extracted text: estimate from the character count,
+      // assuming mixed Chinese/English at ~2.5 characters per token.
+      estimatedTokens = Math.ceil(storedCharCount / 2.5);
+    }
+
+    // Fall back to the text's length (UTF-16 code units) when no count was stored.
+    // NOTE(review): assumes the stored charCount uses the same unit — confirm against the writer.
+    const charCount = storedCharCount || (text ? text.length : 0);
+
+    return {
+      documentId: doc.id,
+      filename: doc.filename,
+      charCount,
+      estimatedTokens,
+      extractionMethod: doc.extractionMethod || undefined,
+    };
+  });
+}
+
+/**
+ * Result of selecting documents so that the file and token limits are met.
+ * Selection strategy: documents with fewer tokens are preferred,
+ * and documents are added until a limit is reached.
+ */
+export interface DocumentSelectionResult {
+  selectedDocuments: DocumentTokenInfo[]; // documents that fit within the limits
+  totalTokens: number; // sum of estimatedTokens over selectedDocuments
+  totalFiles: number; // number of selected documents
+  excludedDocuments: DocumentTokenInfo[]; // documents that were left out
+  reason: 'all_included' | 'file_limit' | 'token_limit'; // which limit caused exclusions, if any
+  availableTokens: number; // token limit minus totalTokens
+}
+
+/**
+ * Picks documents for full-text mode without exceeding the file-count and
+ * token limits.
+ *
+ * Candidates are considered in ascending order of estimatedTokens (smaller
+ * documents first). The input array is copied before sorting and is not
+ * mutated.
+ *
+ * @param documents Candidate documents with their token estimates.
+ * @param maxFiles Maximum number of documents to select.
+ * @param maxTokens Maximum sum of estimatedTokens over the selection.
+ * @returns The selected and excluded documents, the totals, and which limit
+ *   (if any) caused documents to be excluded.
+ */
+export function selectDocumentsForFullText(
+  documents: DocumentTokenInfo[],
+  maxFiles: number = TOKEN_LIMITS.MAX_FILES,
+  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
+): DocumentSelectionResult {
+  // Work on a copy, smallest documents first.
+  const candidates = Array.from(documents);
+  candidates.sort((left, right) => left.estimatedTokens - right.estimatedTokens);
+
+  const kept: DocumentTokenInfo[] = [];
+  const dropped: DocumentTokenInfo[] = [];
+  let usedTokens = 0;
+
+  for (const candidate of candidates) {
+    const overFileLimit = kept.length >= maxFiles;
+    const overTokenLimit = usedTokens + candidate.estimatedTokens > maxTokens;
+
+    if (overFileLimit || overTokenLimit) {
+      dropped.push(candidate);
+    } else {
+      kept.push(candidate);
+      usedTokens += candidate.estimatedTokens;
+    }
+  }
+
+  // Report which limit applied: the file limit if the selection is full,
+  // otherwise the token limit; nothing excluded means everything was included.
+  const reason: 'all_included' | 'file_limit' | 'token_limit' =
+    dropped.length > 0
+      ? (kept.length >= maxFiles ? 'file_limit' : 'token_limit')
+      : 'all_included';
+
+  return {
+    selectedDocuments: kept,
+    totalTokens: usedTokens,
+    totalFiles: kept.length,
+    excludedDocuments: dropped,
+    reason,
+    availableTokens: maxTokens - usedTokens,
+  };
+}
+
+/**
+ * Estimates the number of tokens a query needs.
+ *
+ * The estimate is the token count of the query, plus the token count of the
+ * system prompt when one is given, plus a reserve for the model's response.
+ *
+ * @param query User query text.
+ * @param systemPrompt Optional system prompt sent along with the query.
+ * @param responseReserve Tokens set aside for the response. Defaults to 2000,
+ *   the value that used to be hard-coded, so existing callers are unaffected.
+ * @returns Estimated total tokens for query, system prompt and response.
+ */
+export function estimateQueryTokens(
+  query: string,
+  systemPrompt?: string,
+  responseReserve: number = 2000
+): number {
+  const queryTokens = countTokens(query);
+  const promptTokens = systemPrompt ? countTokens(systemPrompt) : 0;
+
+  return queryTokens + promptTokens + responseReserve;
+}
+
+/**
+ * Checks whether documents plus query fit within the token limit.
+ *
+ * @param documentsTokens Tokens used by the documents.
+ * @param queryTokens Tokens used by the query.
+ * @param maxTokens Token limit to check against.
+ * @returns Whether the total fits, the total, the limit, and the remaining
+ *   tokens (negative when the limit is exceeded).
+ */
+export function checkTokenLimit(
+  documentsTokens: number,
+  queryTokens: number,
+  maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
+): {
+  withinLimit: boolean;
+  totalTokens: number;
+  maxTokens: number;
+  remaining: number;
+} {
+  const used = documentsTokens + queryTokens;
+  const left = maxTokens - used;
+  const fits = left >= 0;
+
+  return { withinLimit: fits, totalTokens: used, maxTokens, remaining: left };
+}
+
+/**
+ * Frees the cached encoder, if there is one, and clears the cache.
+ * Does nothing when no encoder is cached.
+ */
+export function cleanup() {
+  const encoder = encoderCache;
+  if (!encoder) {
+    return;
+  }
+
+  encoder.free();
+  encoderCache = null;
+}
+
+// Release the encoder when the process exits or receives SIGINT.
+if (typeof process !== 'undefined') {
+  const cleanupAndExit = (): void => {
+    cleanup();
+    process.exit();
+  };
+
+  process.on('exit', cleanup);
+  process.on('SIGINT', cleanupAndExit);
+}
+
+
+
+
+
+