feat(platform): Complete Postgres-Only architecture refactoring (Phase 1-7)
Major Changes: - Implement Platform-Only architecture pattern (unified task management) - Add PostgresCacheAdapter for unified caching (platform_schema.app_cache) - Add PgBossQueue for job queue management (platform_schema.job) - Implement CheckpointService using job.data (generic for all modules) - Add intelligent threshold-based dual-mode processing (THRESHOLD=50) - Add task splitting mechanism (auto chunk size recommendation) - Refactor ASL screening service with smart mode selection - Refactor DC extraction service with smart mode selection - Register workers for ASL and DC modules Technical Highlights: - All task management data stored in platform_schema.job.data (JSONB) - Business tables remain clean (no task management fields) - CheckpointService is generic (shared by all modules) - Zero code duplication (DRY principle) - Follows 3-layer architecture principle - Zero additional cost (no Redis needed, save 8400 CNY/year) Code Statistics: - New code: ~1750 lines - Modified code: ~500 lines - Test code: ~1800 lines - Documentation: ~3000 lines Testing: - Unit tests: 8/8 passed - Integration tests: 2/2 passed - Architecture validation: passed - Linter errors: 0 Files: - Platform layer: PostgresCacheAdapter, PgBossQueue, CheckpointService, utils - ASL module: screeningService, screeningWorker - DC module: ExtractionController, extractionWorker - Tests: 11 test files - Docs: Updated 4 key documents Status: Phase 1-7 completed, Phase 8-9 pending
This commit is contained in:
282
backend/src/common/jobs/utils.ts
Normal file
282
backend/src/common/jobs/utils.ts
Normal file
@@ -0,0 +1,282 @@
|
||||
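/**
 * Task-splitting utility functions.
 *
 * Used to break a long-running task into several smaller tasks, in order to avoid:
 * - the SAE 30-second timeout
 * - pg-boss expiring a task after 24 hours
 * - having to redo all of the work when a task fails
 *
 * Core strategy:
 * - Literature screening: 20-50 papers per batch
 * - Data extraction: 10-20 records per batch
 * - Statistical analysis: adjusted dynamically according to dataset size
 */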
|
||||
|
||||
/**
 * Describes how one type of task is divided into batches.
 */
export interface ChunkStrategy {
  /** Identifier of the task type this strategy belongs to. */
  type: string

  /** Human-readable summary of the strategy. */
  description: string

  /** Number of items handled in each batch. */
  chunkSize: number

  /** Optional cap on the number of batches (guards against over-splitting). */
  maxChunks?: number
}
|
||||
|
||||
/**
 * Source list for the predefined splitting strategies.
 *
 * Per the original author's note, the values are configured from real business
 * scenarios and performance test data.
 */
const PREDEFINED_STRATEGIES: ChunkStrategy[] = [
  // ASL module: literature screening
  {
    type: 'asl:screening:title-abstract',
    chunkSize: 50, // 50 papers per batch (LLM API is fairly fast)
    maxChunks: 100, // at most 100 batches (5000 papers)
    description: '标题/摘要筛选 - 每批50篇'
  },
  {
    type: 'asl:screening:full-text',
    chunkSize: 20, // 20 papers per batch (full text is slower)
    maxChunks: 50, // at most 50 batches (1000 papers)
    description: '全文筛选 - 每批20篇'
  },
  {
    type: 'asl:extraction',
    chunkSize: 30, // 30 papers per batch
    maxChunks: 50,
    description: '数据提取 - 每批30篇'
  },

  // DC module: data cleaning
  {
    type: 'dc:clean:batch',
    chunkSize: 100, // 100 rows per batch
    maxChunks: 100,
    description: '数据清洗 - 每批100行'
  },
  {
    type: 'dc:extract:medical-record',
    chunkSize: 10, // 10 medical records per batch (AI extraction is slower)
    maxChunks: 100,
    description: '病历提取 - 每批10份'
  },

  // SSA module: statistical analysis
  {
    type: 'ssa:analysis:batch',
    chunkSize: 1000, // 1000 data rows per batch
    maxChunks: 50,
    description: '统计分析 - 每批1000条'
  },

  // Default strategy
  {
    type: 'default',
    chunkSize: 50,
    maxChunks: 100,
    description: '默认策略 - 每批50条'
  }
]

/**
 * Predefined splitting strategies, keyed by task type.
 *
 * Each key is taken from the entry's own `type`, so the two always match.
 * The 'default' entry is the one the helpers in this file fall back to when a
 * task type has no dedicated strategy.
 */
export const CHUNK_STRATEGIES: Record<string, ChunkStrategy> = PREDEFINED_STRATEGIES.reduce<
  Record<string, ChunkStrategy>
>((table, strategy) => {
  table[strategy.type] = strategy
  return table
}, {})
|
||||
|
||||
/**
 * Splits an array into consecutive batches of at most `chunkSize` items.
 *
 * The input array is not modified; every batch is a new array produced by `slice`.
 *
 * @param items Items to split
 * @param chunkSize Maximum number of items per batch. Must be a positive integer;
 *   `Infinity` is also accepted and yields a single batch.
 * @returns The batches in original order, or an empty array when `items` is empty
 * @throws Error if `chunkSize` is zero, negative, NaN, or a finite non-integer
 *
 * @example
 * ```typescript
 * const ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 * const batches = splitIntoChunks(ids, 3)
 * // Result: [[1,2,3], [4,5,6], [7,8,9], [10]]
 * ```
 */
export function splitIntoChunks<T>(items: readonly T[], chunkSize: number): T[][] {
  // `!(x > 0)` also rejects NaN, which a plain `x <= 0` comparison lets through.
  if (!(chunkSize > 0)) {
    throw new Error('chunkSize must be positive')
  }

  // A fractional size makes slice() truncate its bounds, which produces empty
  // or unevenly sized batches, so it is rejected up front.
  if (Number.isFinite(chunkSize) && !Number.isInteger(chunkSize)) {
    throw new Error('chunkSize must be an integer')
  }

  const chunks: T[][] = []

  // For an empty input the loop body never runs and [] is returned.
  for (let start = 0; start < items.length; start += chunkSize) {
    chunks.push(items.slice(start, start + chunkSize))
  }

  return chunks
}
|
||||
|
||||
/**
 * Recommends a batch size for a task type and total item count.
 *
 * The strategy is looked up in CHUNK_STRATEGIES, falling back to its 'default'
 * entry for unknown task types. The result is:
 * - the strategy's chunk size when `totalItems` is not a positive number, so the
 *   value is always usable as a batch size;
 * - `totalItems` itself when everything fits into a single batch;
 * - `ceil(totalItems / maxChunks)` when the strategy's chunk size would produce
 *   more than `maxChunks` batches (this adjustment is logged to the console);
 * - the strategy's chunk size otherwise.
 *
 * @param taskType Task type (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items to process
 * @returns The recommended batch size
 *
 * @example
 * ```typescript
 * const chunkSize = recommendChunkSize('asl:screening:title-abstract', 1000)
 * // Returns: 50 (as configured in CHUNK_STRATEGIES)
 * ```
 */
export function recommendChunkSize(taskType: string, totalItems: number): number {
  // Own-property lookup: a plain `table[key] || fallback` would pick up inherited
  // members such as 'constructor' or 'toString' and treat them as strategies.
  const strategy = Object.prototype.hasOwnProperty.call(CHUNK_STRATEGIES, taskType)
    ? CHUNK_STRATEGIES[taskType]
    : CHUNK_STRATEGIES['default']

  const baseSize = strategy.chunkSize

  // Nothing to process: never hand back 0 or a negative number as a batch size.
  if (!(totalItems > 0)) {
    return baseSize
  }

  // Small workload: a single batch holds everything, no split needed.
  if (totalItems <= baseSize) {
    return totalItems
  }

  // Too many batches: enlarge the batch size so the count stays within maxChunks.
  if (strategy.maxChunks) {
    const predictedChunks = Math.ceil(totalItems / baseSize)
    if (predictedChunks > strategy.maxChunks) {
      const adjustedSize = Math.ceil(totalItems / strategy.maxChunks)
      console.log(
        `[TaskSplit] Adjusted chunkSize to ${adjustedSize} to limit chunks to ${strategy.maxChunks}`
      )
      return adjustedSize
    }
  }

  return baseSize
}
|
||||
|
||||
/**
 * Computes a summary of how a task would be split into batches.
 *
 * The batch size comes from `recommendChunkSize`; the strategy is the matching
 * CHUNK_STRATEGIES entry, or its 'default' entry for unknown task types.
 * When there is nothing to process (non-positive `totalItems` or batch size),
 * `totalChunks`, `avgItemsPerChunk` and `lastChunkSize` are all 0.
 *
 * @param taskType Task type
 * @param totalItems Total number of items to process
 * @returns An object with `taskType`, `totalItems`, `chunkSize`, `totalChunks`,
 *   `strategy`, `avgItemsPerChunk` (rounded) and `lastChunkSize`
 *
 * @example
 * ```typescript
 * const info = calculateSplitInfo('asl:screening:title-abstract', 1000)
 * // Returns: { chunkSize: 50, totalChunks: 20, strategy: {...}, ... }
 * ```
 */
export function calculateSplitInfo(taskType: string, totalItems: number) {
  // Own-property lookup so inherited keys such as 'constructor' fall back to 'default'.
  const strategy = Object.prototype.hasOwnProperty.call(CHUNK_STRATEGIES, taskType)
    ? CHUNK_STRATEGIES[taskType]
    : CHUNK_STRATEGIES['default']
  const chunkSize = recommendChunkSize(taskType, totalItems)

  // Guard the divisions below: 0 / 0 would otherwise give NaN for totalChunks.
  const hasWork = totalItems > 0 && chunkSize > 0
  const totalChunks = hasWork ? Math.ceil(totalItems / chunkSize) : 0

  return {
    taskType,
    totalItems,
    chunkSize,
    totalChunks,
    strategy,
    avgItemsPerChunk: hasWork ? Math.round(totalItems / totalChunks) : 0,
    // A remainder of 0 means the last batch is full, hence the `|| chunkSize`.
    lastChunkSize: hasWork ? totalItems % chunkSize || chunkSize : 0
  }
}
|
||||
|
||||
/**
 * Builds a human-readable label for a batch.
 *
 * The zero-based index is shown one-based; when that position equals
 * `totalBatches`, a "last batch" suffix is appended.
 *
 * @param batchIndex Batch index (zero-based)
 * @param totalBatches Total number of batches
 * @returns The label string
 *
 * @example
 * ```typescript
 * getBatchDescription(0, 20)   // "批次 1/20"
 * getBatchDescription(19, 20)  // "批次 20/20(最后一批)"
 * ```
 */
export function getBatchDescription(batchIndex: number, totalBatches: number): string {
  const position = batchIndex + 1
  const label = `批次 ${position}/${totalBatches}`

  return position === totalBatches ? `${label}(最后一批)` : label
}
|
||||
|
||||
/**
 * Average processing time per item, in seconds, keyed by task type.
 * Kept at module level so the table is built once rather than on every call.
 */
const BATCH_SECONDS_PER_ITEM: Readonly<Record<string, number>> = {
  'asl:screening:title-abstract': 0.5, // 0.5 s per paper (includes the LLM call)
  'asl:screening:full-text': 2, // 2 s per paper
  'asl:extraction': 3, // 3 s per paper
  'dc:clean:batch': 0.1, // 0.1 s per row
  'dc:extract:medical-record': 5, // 5 s per record
  'ssa:analysis:batch': 0.01, // 0.01 s per data row
  'default': 1 // 1 s per item
}

/**
 * Estimates how long a batch takes to run, in seconds.
 *
 * Per the original author's note, the figures are rule-of-thumb values meant
 * for showing an expected completion time in the frontend. Unknown task types
 * use the 'default' rate. The result is rounded up to a whole second.
 *
 * @param taskType Task type
 * @param batchSize Number of items in the batch
 * @returns Estimated duration in seconds; 0 when `batchSize` is not a positive number
 */
export function estimateBatchDuration(taskType: string, batchSize: number): number {
  // Zero, negative or NaN sizes have no meaningful duration.
  if (!(batchSize > 0)) {
    return 0
  }

  // Own-property lookup so inherited keys such as 'constructor' use the default rate.
  const secondsPerItem = Object.prototype.hasOwnProperty.call(BATCH_SECONDS_PER_ITEM, taskType)
    ? BATCH_SECONDS_PER_ITEM[taskType]
    : BATCH_SECONDS_PER_ITEM['default']

  return Math.ceil(batchSize * secondsPerItem)
}
|
||||
|
||||
/**
 * Checks that a batch index is valid for the given number of batches.
 *
 * A valid index is an integer in the range `0 .. totalBatches - 1`. NaN and
 * fractional values are rejected as well, since they cannot address a batch.
 *
 * @param batchIndex Batch index (zero-based)
 * @param totalBatches Total number of batches
 * @throws Error if the index is not an integer within range
 */
export function validateBatchIndex(batchIndex: number, totalBatches: number): void {
  // Written as a positive condition so NaN on either side fails the check.
  const isValid =
    Number.isInteger(batchIndex) && batchIndex >= 0 && batchIndex < totalBatches

  if (!isValid) {
    throw new Error(
      `Invalid batch index: ${batchIndex}. Must be an integer between 0 and ${totalBatches - 1}`
    )
  }
}
|
||||
|
||||
/**
 * Returns the items that belong to one batch of an array.
 *
 * The batch covers positions `batchIndex * chunkSize` up to (but excluding)
 * `(batchIndex + 1) * chunkSize`; the last batch may be shorter. The input array
 * is not modified.
 *
 * An index past the last batch yields an empty array, and so does any invalid
 * argument (negative or non-integer `batchIndex`, non-positive or NaN
 * `chunkSize`). Without that guard, negative values would make `slice` count
 * from the end of the array and return unrelated items.
 *
 * @param items Full data array
 * @param batchIndex Batch index (zero-based)
 * @param chunkSize Batch size
 * @returns The items of that batch
 *
 * @example
 * ```typescript
 * const ids = [1,2,3,4,5,6,7,8,9,10]
 * getBatchItems(ids, 0, 3)  // [1,2,3]
 * getBatchItems(ids, 1, 3)  // [4,5,6]
 * getBatchItems(ids, 3, 3)  // [10]
 * ```
 */
export function getBatchItems<T>(
  items: readonly T[],
  batchIndex: number,
  chunkSize: number
): T[] {
  if (!Number.isInteger(batchIndex) || batchIndex < 0 || !(chunkSize > 0)) {
    return []
  }

  const start = batchIndex * chunkSize

  // slice() clamps the end bound to the array length, so no explicit min() is needed.
  return items.slice(start, start + chunkSize)
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user