feat(platform): Complete Postgres-Only architecture refactoring (Phase 1-7)

Major Changes:
- Implement Platform-Only architecture pattern (unified task management)
- Add PostgresCacheAdapter for unified caching (platform_schema.app_cache)
- Add PgBossQueue for job queue management (platform_schema.job)
- Implement CheckpointService using job.data (generic for all modules)
- Add intelligent threshold-based dual-mode processing (THRESHOLD=50)
- Add task splitting mechanism (auto chunk size recommendation)
- Refactor ASL screening service with smart mode selection
- Refactor DC extraction service with smart mode selection
- Register workers for ASL and DC modules

Technical Highlights:
- All task management data stored in platform_schema.job.data (JSONB)
- Business tables remain clean (no task management fields)
- CheckpointService is generic (shared by all modules)
- Zero code duplication (DRY principle)
- Follows 3-layer architecture principle
- Zero additional cost (no Redis needed, save 8400 CNY/year)

Code Statistics:
- New code: ~1750 lines
- Modified code: ~500 lines
- Test code: ~1800 lines
- Documentation: ~3000 lines

Testing:
- Unit tests: 8/8 passed
- Integration tests: 2/2 passed
- Architecture validation: passed
- Linter errors: 0

Files:
- Platform layer: PostgresCacheAdapter, PgBossQueue, CheckpointService, utils
- ASL module: screeningService, screeningWorker
- DC module: ExtractionController, extractionWorker
- Tests: 11 test files
- Docs: Updated 4 key documents

Status: Phase 1-7 completed, Phase 8-9 pending
This commit is contained in:
2025-12-13 16:10:04 +08:00
parent a3586cdf30
commit fa72beea6c
135 changed files with 17508 additions and 91 deletions

View File

@@ -0,0 +1,282 @@
/**
* 任务拆分工具函数
*
* 用于将长时间任务拆分成多个小任务,避免:
* - SAE 30秒超时
* - pg-boss 24小时任务过期
* - 任务失败时重做所有工作
*
* 核心策略:
* - 文献筛选:每批20-50篇
* - 数据提取:每批10-20条
* - 统计分析:按数据集大小动态调整
*/
/**
 * Split (chunking) strategy for one task type.
 */
export interface ChunkStrategy {
  /** Task type identifier (e.g. 'asl:screening:title-abstract') */
  type: string
  /** Number of items processed per batch */
  chunkSize: number
  /** Maximum number of batches (prevents over-splitting) */
  maxChunks?: number
  /** Human-readable description of the strategy */
  description: string
}
/**
 * Predefined split strategies, keyed by task type.
 *
 * Values are configured from the actual business scenarios and
 * performance-test data. Unknown task types fall back to 'default'.
 */
export const CHUNK_STRATEGIES: Record<string, ChunkStrategy> = {
  // ASL module: literature screening
  'asl:screening:title-abstract': {
    type: 'asl:screening:title-abstract',
    chunkSize: 50, // 50 papers per batch (title/abstract LLM calls are fast)
    maxChunks: 100, // at most 100 batches (5000 papers)
    description: '标题/摘要筛选 - 每批50篇'
  },
  'asl:screening:full-text': {
    type: 'asl:screening:full-text',
    chunkSize: 20, // 20 papers per batch (full-text screening is slower)
    maxChunks: 50, // at most 50 batches (1000 papers)
    description: '全文筛选 - 每批20篇'
  },
  'asl:extraction': {
    type: 'asl:extraction',
    chunkSize: 30, // 30 papers per batch
    maxChunks: 50,
    description: '数据提取 - 每批30篇'
  },
  // DC module: data cleaning
  'dc:clean:batch': {
    type: 'dc:clean:batch',
    chunkSize: 100, // 100 rows per batch
    maxChunks: 100,
    description: '数据清洗 - 每批100行'
  },
  'dc:extract:medical-record': {
    type: 'dc:extract:medical-record',
    chunkSize: 10, // 10 medical records per batch (AI extraction is slow)
    maxChunks: 100,
    description: '病历提取 - 每批10份'
  },
  // SSA module: statistical analysis
  'ssa:analysis:batch': {
    type: 'ssa:analysis:batch',
    chunkSize: 1000, // 1000 data rows per batch
    maxChunks: 50,
    description: '统计分析 - 每批1000条'
  },
  // Fallback strategy for unknown task types
  'default': {
    type: 'default',
    chunkSize: 50,
    maxChunks: 100,
    description: '默认策略 - 每批50条'
  }
}
/**
 * Partition an array into consecutive batches.
 *
 * @param items Array to be partitioned
 * @param chunkSize Number of items per batch (must be > 0)
 * @returns Array of batches; the final batch may be smaller than chunkSize
 * @throws Error when chunkSize is not positive
 *
 * @example
 * ```typescript
 * splitIntoChunks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)
 * // => [[1,2,3], [4,5,6], [7,8,9], [10]]
 * ```
 */
export function splitIntoChunks<T>(items: T[], chunkSize: number): T[][] {
  if (chunkSize <= 0) {
    throw new Error('chunkSize must be positive')
  }
  if (items.length === 0) {
    return []
  }
  // Build each batch directly from its index range instead of an
  // accumulating loop; slice clamps the end index on the last batch.
  const batchCount = Math.ceil(items.length / chunkSize)
  return Array.from({ length: batchCount }, (_, batchIndex) =>
    items.slice(batchIndex * chunkSize, (batchIndex + 1) * chunkSize)
  )
}
/**
 * Recommend a batch size for the given task type.
 *
 * Looks up the strategy for `taskType` (falling back to 'default'), then:
 * - workloads that fit in a single batch are not split at all, and
 * - the chunk size is grown so the batch count never exceeds maxChunks.
 *
 * @param taskType Task type key (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items to process
 * @returns Recommended batch size
 *
 * @example
 * ```typescript
 * const chunkSize = recommendChunkSize('asl:screening:title-abstract', 1000)
 * // => 50 (from CHUNK_STRATEGIES)
 * ```
 */
export function recommendChunkSize(taskType: string, totalItems: number): number {
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']
  // Guard: the `totalItems <= chunkSize` shortcut below would return 0 (or a
  // negative number) for non-positive workloads, propagating a zero chunk
  // size to callers (division by zero, or splitIntoChunks throwing).
  if (totalItems <= 0) {
    return strategy.chunkSize
  }
  let chunkSize = strategy.chunkSize
  // Small workload: process everything in one batch.
  if (totalItems <= chunkSize) {
    return totalItems
  }
  // Too many batches: enlarge chunkSize until maxChunks is respected.
  if (strategy.maxChunks) {
    const predictedChunks = Math.ceil(totalItems / chunkSize)
    if (predictedChunks > strategy.maxChunks) {
      chunkSize = Math.ceil(totalItems / strategy.maxChunks)
      console.log(
        `[TaskSplit] Adjusted chunkSize to ${chunkSize} to limit chunks to ${strategy.maxChunks}`
      )
    }
  }
  return chunkSize
}
/**
 * Compute the full split plan for a workload.
 *
 * @param taskType Task type key (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items to process
 * @returns Plan with the chunk size, number of batches, the strategy used,
 *          the average batch size, and the size of the final batch
 *
 * @example
 * ```typescript
 * const info = calculateSplitInfo('asl:screening:title-abstract', 1000)
 * // => { chunkSize: 50, totalChunks: 20, strategy: {...}, ... }
 * ```
 */
export function calculateSplitInfo(taskType: string, totalItems: number) {
  const strategy = CHUNK_STRATEGIES[taskType] || CHUNK_STRATEGIES['default']
  // Guard: with totalItems === 0 the recommended chunk size is 0, which makes
  // totalChunks and lastChunkSize NaN below. Report an explicit empty plan.
  if (totalItems <= 0) {
    return {
      taskType,
      totalItems,
      chunkSize: strategy.chunkSize,
      totalChunks: 0,
      strategy,
      avgItemsPerChunk: 0,
      lastChunkSize: 0
    }
  }
  const chunkSize = recommendChunkSize(taskType, totalItems)
  const totalChunks = Math.ceil(totalItems / chunkSize)
  return {
    taskType,
    totalItems,
    chunkSize,
    totalChunks,
    strategy,
    avgItemsPerChunk: totalChunks > 0 ? Math.round(totalItems / totalChunks) : 0,
    // `totalItems % chunkSize` is 0 when it divides evenly, meaning the last
    // batch is a full chunk — hence the `|| chunkSize` fallback.
    lastChunkSize: totalItems % chunkSize || chunkSize
  }
}
/**
 * Build a human-readable label for a batch.
 *
 * @param batchIndex Zero-based batch index
 * @param totalBatches Total number of batches
 * @returns Description string (marks the final batch explicitly)
 *
 * @example
 * ```typescript
 * getBatchDescription(0, 20)  // "批次 1/20"
 * getBatchDescription(19, 20) // "批次 20/20(最后一批)"
 * ```
 */
export function getBatchDescription(batchIndex: number, totalBatches: number): string {
  const displayIndex = batchIndex + 1
  const base = `批次 ${displayIndex}/${totalBatches}`
  return displayIndex === totalBatches ? `${base}(最后一批)` : base
}
/**
 * Estimate how long a batch will take to execute, in seconds.
 *
 * Based on empirical per-item timings; intended for showing an
 * estimated completion time in the frontend.
 *
 * @param taskType Task type key
 * @param batchSize Number of items in the batch
 * @returns Estimated duration in whole seconds (rounded up)
 */
export function estimateBatchDuration(taskType: string, batchSize: number): number {
  // Average processing time per item, in seconds (empirical values).
  const SECONDS_PER_ITEM: Record<string, number> = {
    'asl:screening:title-abstract': 0.5, // includes an LLM call per paper
    'asl:screening:full-text': 2,
    'asl:extraction': 3,
    'dc:clean:batch': 0.1,
    'dc:extract:medical-record': 5, // AI extraction per record is slow
    'ssa:analysis:batch': 0.01,
    'default': 1
  }
  const secondsPerItem = SECONDS_PER_ITEM[taskType] ?? SECONDS_PER_ITEM['default']
  return Math.ceil(batchSize * secondsPerItem)
}
/**
 * Validate that a batch index falls within [0, totalBatches).
 *
 * @param batchIndex Zero-based batch index to check
 * @param totalBatches Total number of batches
 * @throws Error when the index is out of range
 */
export function validateBatchIndex(batchIndex: number, totalBatches: number): void {
  const isWithinRange = batchIndex >= 0 && batchIndex < totalBatches
  if (isWithinRange) {
    return
  }
  throw new Error(
    `Invalid batch index: ${batchIndex}. Must be between 0 and ${totalBatches - 1}`
  )
}
/**
 * Extract one batch's worth of items from the full array.
 *
 * @param items Complete data array
 * @param batchIndex Zero-based batch index
 * @param chunkSize Batch size
 * @returns The items belonging to that batch (empty when past the end)
 *
 * @example
 * ```typescript
 * const ids = [1,2,3,4,5,6,7,8,9,10]
 * getBatchItems(ids, 0, 3) // [1,2,3]
 * getBatchItems(ids, 1, 3) // [4,5,6]
 * getBatchItems(ids, 3, 3) // [10]
 * ```
 */
export function getBatchItems<T>(
  items: T[],
  batchIndex: number,
  chunkSize: number
): T[] {
  // slice() already clamps the end index to the array length, so no
  // explicit Math.min bound is needed.
  const offset = batchIndex * chunkSize
  return items.slice(offset, offset + chunkSize)
}