feat(platform): Complete Postgres-Only architecture refactoring (Phase 1-7)

Major Changes:
- Implement Platform-Only architecture pattern (unified task management)
- Add PostgresCacheAdapter for unified caching (platform_schema.app_cache)
- Add PgBossQueue for job queue management (platform_schema.job)
- Implement CheckpointService using job.data (generic for all modules)
- Add intelligent threshold-based dual-mode processing (THRESHOLD=50)
- Add task splitting mechanism (auto chunk size recommendation)
- Refactor ASL screening service with smart mode selection
- Refactor DC extraction service with smart mode selection
- Register workers for ASL and DC modules

Technical Highlights:
- All task management data stored in platform_schema.job.data (JSONB)
- Business tables remain clean (no task management fields)
- CheckpointService is generic (shared by all modules)
- Zero code duplication (DRY principle)
- Follows 3-layer architecture principle
- Zero additional cost (no Redis needed, save 8400 CNY/year)

Code Statistics:
- New code: ~1750 lines
- Modified code: ~500 lines
- Test code: ~1800 lines
- Documentation: ~3000 lines

Testing:
- Unit tests: 8/8 passed
- Integration tests: 2/2 passed
- Architecture validation: passed
- Linter errors: 0

Files:
- Platform layer: PostgresCacheAdapter, PgBossQueue, CheckpointService, utils
- ASL module: screeningService, screeningWorker
- DC module: ExtractionController, extractionWorker
- Tests: 11 test files
- Docs: Updated 4 key documents

Status: Phase 1-7 completed, Phase 8-9 pending
This commit is contained in:
2025-12-13 16:10:04 +08:00
parent a3586cdf30
commit fa72beea6c
135 changed files with 17508 additions and 91 deletions

View File

@@ -0,0 +1,282 @@
/**
* 任务拆分工具函数
*
* 用于将长时间任务拆分成多个小任务,避免:
* - SAE 30秒超时
* - pg-boss 24小时任务过期
* - 任务失败时重做所有工作
*
* 核心策略:
* - 文献筛选:每批20-50篇
* - 数据提取:每批10-20条
* - 统计分析:按数据集大小动态调整
*/
/**
 * Split (chunking) strategy for one task type.
 */
export interface ChunkStrategy {
  /** Task type identifier (e.g. 'asl:screening:title-abstract') */
  type: string
  /** Number of items processed per batch */
  chunkSize: number
  /** Maximum number of batches (prevents over-splitting) */
  maxChunks?: number
  /** Human-readable description of the strategy */
  description: string
}
/**
 * Predefined split strategies, keyed by task type.
 *
 * Values are configured from the actual business scenarios and
 * performance-test data. Unknown task types fall back to 'default'.
 */
export const CHUNK_STRATEGIES: Record<string, ChunkStrategy> = {
  // ASL module: literature screening
  'asl:screening:title-abstract': {
    type: 'asl:screening:title-abstract',
    chunkSize: 50, // 50 papers per batch (title/abstract LLM calls are fast)
    maxChunks: 100, // at most 100 batches (5000 papers)
    description: '标题/摘要筛选 - 每批50篇'
  },
  'asl:screening:full-text': {
    type: 'asl:screening:full-text',
    chunkSize: 20, // 20 papers per batch (full-text screening is slower)
    maxChunks: 50, // at most 50 batches (1000 papers)
    description: '全文筛选 - 每批20篇'
  },
  'asl:extraction': {
    type: 'asl:extraction',
    chunkSize: 30, // 30 papers per batch
    maxChunks: 50,
    description: '数据提取 - 每批30篇'
  },
  // DC module: data cleaning
  'dc:clean:batch': {
    type: 'dc:clean:batch',
    chunkSize: 100, // 100 rows per batch
    maxChunks: 100,
    description: '数据清洗 - 每批100行'
  },
  'dc:extract:medical-record': {
    type: 'dc:extract:medical-record',
    chunkSize: 10, // 10 medical records per batch (AI extraction is slow)
    maxChunks: 100,
    description: '病历提取 - 每批10份'
  },
  // SSA module: statistical analysis
  'ssa:analysis:batch': {
    type: 'ssa:analysis:batch',
    chunkSize: 1000, // 1000 data rows per batch
    maxChunks: 50,
    description: '统计分析 - 每批1000条'
  },
  // Fallback strategy for unknown task types
  'default': {
    type: 'default',
    chunkSize: 50,
    maxChunks: 100,
    description: '默认策略 - 每批50条'
  }
}
/**
 * Partition an array into consecutive batches.
 *
 * @param items Array to be partitioned
 * @param chunkSize Number of items per batch (must be > 0)
 * @returns Array of batches; the final batch may be smaller than chunkSize
 * @throws Error when chunkSize is not positive
 *
 * @example
 * ```typescript
 * splitIntoChunks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)
 * // => [[1,2,3], [4,5,6], [7,8,9], [10]]
 * ```
 */
export function splitIntoChunks<T>(items: T[], chunkSize: number): T[][] {
  if (chunkSize <= 0) {
    throw new Error('chunkSize must be positive')
  }
  if (items.length === 0) {
    return []
  }
  // Build each batch directly from its index range instead of an
  // accumulating loop; slice clamps the end index on the last batch.
  const batchCount = Math.ceil(items.length / chunkSize)
  return Array.from({ length: batchCount }, (_, batchIndex) =>
    items.slice(batchIndex * chunkSize, (batchIndex + 1) * chunkSize)
  )
}
/**
 * Recommend a batch size for the given task type.
 *
 * Looks up the strategy for `taskType` (falling back to 'default'), then:
 * - workloads that fit in a single batch are not split at all, and
 * - the chunk size is grown so the batch count never exceeds maxChunks.
 *
 * @param taskType Task type key (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items to process
 * @returns Recommended batch size
 *
 * @example
 * ```typescript
 * const chunkSize = recommendChunkSize('asl:screening:title-abstract', 1000)
 * // => 50 (from CHUNK_STRATEGIES)
 * ```
 */
export function recommendChunkSize(taskType: string, totalItems: number): number {
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']
  // Guard: the `totalItems <= chunkSize` shortcut below would return 0 (or a
  // negative number) for non-positive workloads, propagating a zero chunk
  // size to callers (division by zero, or splitIntoChunks throwing).
  if (totalItems <= 0) {
    return strategy.chunkSize
  }
  let chunkSize = strategy.chunkSize
  // Small workload: process everything in one batch.
  if (totalItems <= chunkSize) {
    return totalItems
  }
  // Too many batches: enlarge chunkSize until maxChunks is respected.
  if (strategy.maxChunks) {
    const predictedChunks = Math.ceil(totalItems / chunkSize)
    if (predictedChunks > strategy.maxChunks) {
      chunkSize = Math.ceil(totalItems / strategy.maxChunks)
      console.log(
        `[TaskSplit] Adjusted chunkSize to ${chunkSize} to limit chunks to ${strategy.maxChunks}`
      )
    }
  }
  return chunkSize
}
/**
 * Compute the full split plan for a workload.
 *
 * @param taskType Task type key (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items to process
 * @returns Plan with the chunk size, number of batches, the strategy used,
 *          the average batch size, and the size of the final batch
 *
 * @example
 * ```typescript
 * const info = calculateSplitInfo('asl:screening:title-abstract', 1000)
 * // => { chunkSize: 50, totalChunks: 20, strategy: {...}, ... }
 * ```
 */
export function calculateSplitInfo(taskType: string, totalItems: number) {
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']
  // Guard: with totalItems === 0 the recommended chunk size is 0, which makes
  // totalChunks and lastChunkSize NaN below. Report an explicit empty plan.
  if (totalItems <= 0) {
    return {
      taskType,
      totalItems,
      chunkSize: strategy.chunkSize,
      totalChunks: 0,
      strategy,
      avgItemsPerChunk: 0,
      lastChunkSize: 0
    }
  }
  const chunkSize = recommendChunkSize(taskType, totalItems)
  const totalChunks = Math.ceil(totalItems / chunkSize)
  return {
    taskType,
    totalItems,
    chunkSize,
    totalChunks,
    strategy,
    avgItemsPerChunk: totalChunks > 0 ? Math.round(totalItems / totalChunks) : 0,
    // `totalItems % chunkSize` is 0 when it divides evenly, meaning the last
    // batch is a full chunk — hence the `|| chunkSize` fallback.
    lastChunkSize: totalItems % chunkSize || chunkSize
  }
}
/**
 * Build a human-readable label for a batch.
 *
 * @param batchIndex Zero-based batch index
 * @param totalBatches Total number of batches
 * @returns Description string (marks the final batch explicitly)
 *
 * @example
 * ```typescript
 * getBatchDescription(0, 20)  // "批次 1/20"
 * getBatchDescription(19, 20) // "批次 20/20(最后一批)"
 * ```
 */
export function getBatchDescription(batchIndex: number, totalBatches: number): string {
  const displayIndex = batchIndex + 1
  const base = `批次 ${displayIndex}/${totalBatches}`
  return displayIndex === totalBatches ? `${base}(最后一批)` : base
}
/**
 * Estimate how long a batch will take to execute, in seconds.
 *
 * Based on empirical per-item timings; intended for showing an
 * estimated completion time in the frontend.
 *
 * @param taskType Task type key
 * @param batchSize Number of items in the batch
 * @returns Estimated duration in whole seconds (rounded up)
 */
export function estimateBatchDuration(taskType: string, batchSize: number): number {
  // Average processing time per item, in seconds (empirical values).
  const SECONDS_PER_ITEM: Record<string, number> = {
    'asl:screening:title-abstract': 0.5, // includes an LLM call per paper
    'asl:screening:full-text': 2,
    'asl:extraction': 3,
    'dc:clean:batch': 0.1,
    'dc:extract:medical-record': 5, // AI extraction per record is slow
    'ssa:analysis:batch': 0.01,
    'default': 1
  }
  const secondsPerItem = SECONDS_PER_ITEM[taskType] ?? SECONDS_PER_ITEM['default']
  return Math.ceil(batchSize * secondsPerItem)
}
/**
 * Validate that a batch index falls within [0, totalBatches).
 *
 * @param batchIndex Zero-based batch index to check
 * @param totalBatches Total number of batches
 * @throws Error when the index is out of range
 */
export function validateBatchIndex(batchIndex: number, totalBatches: number): void {
  const isWithinRange = batchIndex >= 0 && batchIndex < totalBatches
  if (isWithinRange) {
    return
  }
  throw new Error(
    `Invalid batch index: ${batchIndex}. Must be between 0 and ${totalBatches - 1}`
  )
}
/**
 * Extract one batch's worth of items from the full array.
 *
 * @param items Complete data array
 * @param batchIndex Zero-based batch index
 * @param chunkSize Batch size
 * @returns The items belonging to that batch (empty when past the end)
 *
 * @example
 * ```typescript
 * const ids = [1,2,3,4,5,6,7,8,9,10]
 * getBatchItems(ids, 0, 3) // [1,2,3]
 * getBatchItems(ids, 1, 3) // [4,5,6]
 * getBatchItems(ids, 3, 3) // [10]
 * ```
 */
export function getBatchItems<T>(
  items: T[],
  batchIndex: number,
  chunkSize: number
): T[] {
  // slice() already clamps the end index to the array length, so no
  // explicit Math.min bound is needed.
  const offset = batchIndex * chunkSize
  return items.slice(offset, offset + chunkSize)
}