AIclinicalresearch/backend/src/common/jobs/utils.ts

/**
 * 任务拆分工具函数
 *
 * 用于将长时间任务拆分成多个小任务，避免：
 * - SAE 30秒超时
 * - pg-boss 24小时任务过期
 * - 任务失败时重做所有工作
 *
 * 核心策略：
 * - 文献筛选：每批20-50篇
 * - 数据提取：每批10-20条
 * - 统计分析：按数据集大小动态调整
 */

/**
 * 任务类型的拆分策略
 */
export interface ChunkStrategy {
  /** 任务类型标识 */
  type: string

  /** 每批处理的数据量 */
  chunkSize: number

  /** 最大批次数（防止过度拆分） */
  maxChunks?: number

  /** 描述 */
  description: string
}

/**
 * 预定义的拆分策略
 *
 * 根据实际业务场景和性能测试数据配置
 */
export const CHUNK_STRATEGIES: Record<string, ChunkStrategy> = {
  // ASL模块：文献筛选
  'asl:screening:title-abstract': {
    type: 'asl:screening:title-abstract',
    chunkSize: 50, // 每批50篇（LLM API较快）
    maxChunks: 100, // 最多100批（5000篇）
    description: '标题/摘要筛选 - 每批50篇'
  },

  'asl:screening:full-text': {
    type: 'asl:screening:full-text',
    chunkSize: 20, // 每批20篇（全文较慢）
    maxChunks: 50, // 最多50批（1000篇）
    description: '全文筛选 - 每批20篇'
  },

  'asl:extraction': {
    type: 'asl:extraction',
    chunkSize: 30, // 每批30篇
    maxChunks: 50,
    description: '数据提取 - 每批30篇'
  },

  // DC模块：数据清洗
  'dc:clean:batch': {
    type: 'dc:clean:batch',
    chunkSize: 100, // 每批100行
    maxChunks: 100,
    description: '数据清洗 - 每批100行'
  },

  'dc:extract:medical-record': {
    type: 'dc:extract:medical-record',
    chunkSize: 10, // 每批10份病历（AI提取较慢）
    maxChunks: 100,
    description: '病历提取 - 每批10份'
  },

  // SSA模块：统计分析
  'ssa:analysis:batch': {
    type: 'ssa:analysis:batch',
    chunkSize: 1000, // 每批1000条数据
    maxChunks: 50,
    description: '统计分析 - 每批1000条'
  },

  // 默认策略
  'default': {
    type: 'default',
    chunkSize: 50,
    maxChunks: 100,
    description: '默认策略 - 每批50条'
  }
}

/**
 * 将数据数组拆分成多个批次
 *
 * @param items 要拆分的数据数组
 * @param chunkSize 每批的大小
 * @returns 拆分后的批次数组
 *
 * @example
 * ```typescript
 * const ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 * const batches = splitIntoChunks(ids, 3)
 * // 结果: [[1,2,3], [4,5,6], [7,8,9], [10]]
 * ```
 */
export function splitIntoChunks<T>(items: T[], chunkSize: number): T[][] {
  if (chunkSize <= 0) {
    throw new Error('chunkSize must be positive')
  }

  if (items.length === 0) {
    return []
  }

  const chunks: T[][] = []

  for (let i = 0; i < items.length; i += chunkSize) {
    chunks.push(items.slice(i, i + chunkSize))
  }

  return chunks
}

/**
 * 根据任务类型推荐批次大小
 *
 * @param taskType 任务类型（如：'asl:screening:title-abstract'）
 * @param totalItems 总数据量
 * @returns 推荐的批次大小
 *
 * @example
 * ```typescript
 * const chunkSize = recommendChunkSize('asl:screening:title-abstract', 1000)
 * // 返回: 50 (根据CHUNK_STRATEGIES配置)
 * ```
 */
export function recommendChunkSize(taskType: string, totalItems: number): number {
  // 查找对应的策略
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']

  let chunkSize = strategy.chunkSize

  // 如果总量很小，不拆分
  if (totalItems <= chunkSize) {
    return totalItems
  }

  // 如果拆分后批次数超过maxChunks，增大chunkSize
  if (strategy.maxChunks) {
    const predictedChunks = Math.ceil(totalItems / chunkSize)
    if (predictedChunks > strategy.maxChunks) {
      chunkSize = Math.ceil(totalItems / strategy.maxChunks)
      console.log(
        `[TaskSplit] Adjusted chunkSize to ${chunkSize} to limit chunks to ${strategy.maxChunks}`
      )
    }
  }

  return chunkSize
}

/**
 * 计算任务拆分信息
 *
 * @param taskType 任务类型
 * @param totalItems 总数据量
 * @returns 拆分信息
 *
 * @example
 * ```typescript
 * const info = calculateSplitInfo('asl:screening:title-abstract', 1000)
 * // 返回: { chunkSize: 50, totalChunks: 20, strategy: {...} }
 * ```
 */
export function calculateSplitInfo(taskType: string, totalItems: number) {
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']
  const chunkSize = recommendChunkSize(taskType, totalItems)
  const totalChunks = Math.ceil(totalItems / chunkSize)

  return {
    taskType,
    totalItems,
    chunkSize,
    totalChunks,
    strategy,
    avgItemsPerChunk: totalChunks > 0 ? Math.round(totalItems / totalChunks) : 0,
    lastChunkSize: totalItems % chunkSize || chunkSize
  }
}

/**
 * 获取批次索引的人类可读描述
 *
 * @param batchIndex 批次索引（从0开始）
 * @param totalBatches 总批次数
 * @returns 描述字符串
 *
 * @example
 * ```typescript
 * getBatchDescription(0, 20) // "批次 1/20"
 * getBatchDescription(19, 20) // "批次 20/20（最后一批）"
 * ```
 */
export function getBatchDescription(batchIndex: number, totalBatches: number): string {
  const humanIndex = batchIndex + 1

  if (humanIndex === totalBatches) {
    return `批次 ${humanIndex}/${totalBatches}（最后一批）`
  }

  return `批次 ${humanIndex}/${totalBatches}`
}

/**
 * 估算批次执行时间（秒）
 *
 * 基于经验值估算，用于前端显示预计完成时间
 *
 * @param taskType 任务类型
 * @param batchSize 批次大小
 * @returns 估算的执行时间（秒）
 */
export function estimateBatchDuration(taskType: string, batchSize: number): number {
  // 每项平均处理时间（秒）
  const TIME_PER_ITEM: Record<string, number> = {
    'asl:screening:title-abstract': 0.5, // 0.5秒/篇（含LLM调用）
    'asl:screening:full-text': 2, // 2秒/篇
    'asl:extraction': 3, // 3秒/篇
    'dc:clean:batch': 0.1, // 0.1秒/行
    'dc:extract:medical-record': 5, // 5秒/份
    'ssa:analysis:batch': 0.01, // 0.01秒/条
    'default': 1 // 1秒/条
  }

  const timePerItem = TIME_PER_ITEM[taskType] || TIME_PER_ITEM['default']

  return Math.ceil(batchSize * timePerItem)
}

/**
 * 验证批次索引是否有效
 *
 * @param batchIndex 批次索引
 * @param totalBatches 总批次数
 * @throws Error 如果索引无效
 */
export function validateBatchIndex(batchIndex: number, totalBatches: number): void {
  if (batchIndex < 0 || batchIndex >= totalBatches) {
    throw new Error(
      `Invalid batch index: ${batchIndex}. Must be between 0 and ${totalBatches - 1}`
    )
  }
}

/**
 * 从数组中提取指定批次的数据
 *
 * @param items 完整数据数组
 * @param batchIndex 批次索引（从0开始）
 * @param chunkSize 批次大小
 * @returns 该批次的数据
 *
 * @example
 * ```typescript
 * const ids = [1,2,3,4,5,6,7,8,9,10]
 * getBatchItems(ids, 0, 3) // [1,2,3]
 * getBatchItems(ids, 1, 3) // [4,5,6]
 * getBatchItems(ids, 3, 3) // [10]
 * ```
 */
export function getBatchItems<T>(
  items: T[],
  batchIndex: number,
  chunkSize: number
): T[] {
  const start = batchIndex * chunkSize
  const end = Math.min(start + chunkSize, items.length)

  return items.slice(start, end)
}