Summary:
- Fix the Prompt list API response schema, which was missing the activeVersion and draftVersion fields
- Fastify strips response fields that are not declared in the response schema, which caused the version columns to show empty
- Add detailed diagnostic logging for Prompt debug mode troubleshooting
- Verify debug mode works correctly (the DRAFT version is used when debug is enabled)

Changes:
- backend/src/common/prompt/prompt.routes.ts: Add activeVersion and draftVersion to the response schema
- backend/src/common/prompt/prompt.service.ts: Add diagnostic logs for the setDebugMode and get methods
- PKB module: Various authentication and document handling fixes from the previous session

Tested: Debug mode verified working - the v2 DRAFT version is correctly loaded when debug is enabled
324 lines
7.2 KiB
TypeScript
324 lines
7.2 KiB
TypeScript
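/**
 * Task-splitting utility functions.
 *
 * Used to split a long-running task into several smaller tasks, in order to avoid:
 * - the SAE 30-second timeout
 * - the pg-boss 24-hour job expiration
 * - redoing all of the work when a task fails
 *
 * Core strategy:
 * - literature screening: 20-50 papers per batch
 * - data extraction: 10-20 records per batch
 * - statistical analysis: adjusted dynamically according to dataset size
 */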
|
||
|
||
/**
 * Splitting strategy for a single task type.
 */
export interface ChunkStrategy {
  /** Task type identifier. */
  type: string

  /** Number of items processed per batch. */
  chunkSize: number

  /** Maximum number of batches (guards against over-splitting). */
  maxChunks?: number

  /** Human-readable description. */
  description: string
}

/**
 * Predefined splitting strategies, keyed by task type.
 *
 * Each entry's `type` repeats its own key. The `'default'` entry is the
 * fallback used by the helpers in this file when a task type has no entry.
 * Per the original author's note, the numbers come from real business
 * scenarios and performance test data.
 */
export const CHUNK_STRATEGIES: Record<string, ChunkStrategy> = {
  // ASL module: literature screening
  'asl:screening:title-abstract': {
    type: 'asl:screening:title-abstract',
    chunkSize: 50, // 50 papers per batch (LLM API is relatively fast)
    maxChunks: 100, // at most 100 batches (5000 papers)
    description: '标题/摘要筛选 - 每批50篇'
  },

  'asl:screening:full-text': {
    type: 'asl:screening:full-text',
    chunkSize: 20, // 20 papers per batch (full text is slower)
    maxChunks: 50, // at most 50 batches (1000 papers)
    description: '全文筛选 - 每批20篇'
  },

  'asl:extraction': {
    type: 'asl:extraction',
    chunkSize: 30, // 30 papers per batch
    maxChunks: 50,
    description: '数据提取 - 每批30篇'
  },

  // DC module: data cleaning
  'dc:clean:batch': {
    type: 'dc:clean:batch',
    chunkSize: 100, // 100 rows per batch
    maxChunks: 100,
    description: '数据清洗 - 每批100行'
  },

  'dc:extract:medical-record': {
    type: 'dc:extract:medical-record',
    chunkSize: 10, // 10 medical records per batch (AI extraction is slower)
    maxChunks: 100,
    description: '病历提取 - 每批10份'
  },

  // SSA module: statistical analysis
  'ssa:analysis:batch': {
    type: 'ssa:analysis:batch',
    chunkSize: 1000, // 1000 data rows per batch
    maxChunks: 50,
    description: '统计分析 - 每批1000条'
  },

  // Default strategy
  'default': {
    type: 'default',
    chunkSize: 50,
    maxChunks: 100,
    description: '默认策略 - 每批50条'
  }
}
|
||
|
||
/**
 * Splits an array into consecutive batches.
 *
 * Every batch holds `chunkSize` items except possibly the last, which holds
 * the remainder. The input array is not modified.
 *
 * @param items Array to split
 * @param chunkSize Size of each batch; must be a positive integer
 *   (`Infinity` is also accepted and yields a single batch)
 * @returns Array of batches; empty when `items` is empty
 * @throws Error if `chunkSize` is NaN, zero, negative, or a finite non-integer
 *
 * @example
 * ```typescript
 * const ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 * const batches = splitIntoChunks(ids, 3)
 * // Result: [[1,2,3], [4,5,6], [7,8,9], [10]]
 * ```
 */
export function splitIntoChunks<T>(items: readonly T[], chunkSize: number): T[][] {
  // NaN compares false against everything, so it must be rejected explicitly;
  // otherwise the loop below would emit a single empty chunk and drop all items.
  if (Number.isNaN(chunkSize) || chunkSize <= 0) {
    throw new Error('chunkSize must be positive')
  }

  // A fractional step makes slice() truncate its bounds, which produces empty
  // or unevenly sized chunks.
  if (chunkSize !== Infinity && !Number.isInteger(chunkSize)) {
    throw new Error('chunkSize must be an integer')
  }

  if (items.length === 0) {
    return []
  }

  const chunks: T[][] = []

  for (let i = 0; i < items.length; i += chunkSize) {
    chunks.push(items.slice(i, i + chunkSize))
  }

  return chunks
}
|
||
|
||
/**
 * Recommends a batch size for a task type.
 *
 * Starts from the `chunkSize` of the matching entry in `CHUNK_STRATEGIES`
 * (or the `'default'` entry when the task type has none), then:
 * - returns `totalItems` itself when it does not exceed that chunk size
 *   (the work is small enough to run as one batch);
 * - enlarges the chunk size to `ceil(totalItems / maxChunks)` when the
 *   strategy defines `maxChunks` and the predicted batch count would exceed
 *   it, and logs the adjustment to the console.
 *
 * @param taskType Task type (e.g. 'asl:screening:title-abstract')
 * @param totalItems Total number of items
 * @returns Recommended batch size
 *
 * @example
 * ```typescript
 * const chunkSize = recommendChunkSize('asl:screening:title-abstract', 1000)
 * // Returns: 50 (from the CHUNK_STRATEGIES configuration)
 * ```
 */
export function recommendChunkSize(taskType: string, totalItems: number): number {
  // Look up the strategy by own property only: a plain `table[key] || fallback`
  // would also match inherited keys such as 'constructor' or 'toString', whose
  // values are truthy but are not strategies.
  const strategy = Object.prototype.hasOwnProperty.call(CHUNK_STRATEGIES, taskType)
    ? CHUNK_STRATEGIES[taskType]
    : CHUNK_STRATEGIES['default']

  let chunkSize = strategy.chunkSize

  // Small workloads are not split.
  if (totalItems <= chunkSize) {
    return totalItems
  }

  // If splitting would produce more than maxChunks batches, enlarge chunkSize.
  if (strategy.maxChunks) {
    const predictedChunks = Math.ceil(totalItems / chunkSize)
    if (predictedChunks > strategy.maxChunks) {
      chunkSize = Math.ceil(totalItems / strategy.maxChunks)
      console.log(
        `[TaskSplit] Adjusted chunkSize to ${chunkSize} to limit chunks to ${strategy.maxChunks}`
      )
    }
  }

  return chunkSize
}
|
||
|
||
/**
 * Computes the split plan for a task.
 *
 * The chunk size comes from `recommendChunkSize`; the strategy is the matching
 * entry in `CHUNK_STRATEGIES`, or the `'default'` entry when the task type has
 * none.
 *
 * @param taskType Task type
 * @param totalItems Total number of items
 * @returns Split information: the inputs, the chosen `chunkSize`, the
 *   resulting `totalChunks`, the `strategy` used, the rounded average number
 *   of items per chunk, and the size of the last chunk. When there is nothing
 *   to split, `totalChunks`, `avgItemsPerChunk` and `lastChunkSize` are 0.
 *
 * @example
 * ```typescript
 * const info = calculateSplitInfo('asl:screening:title-abstract', 1000)
 * // Returns: { chunkSize: 50, totalChunks: 20, strategy: {...} }
 * ```
 */
export function calculateSplitInfo(taskType: string, totalItems: number) {
  // Own-property lookup so inherited keys such as 'constructor' fall back to
  // the default strategy instead of resolving to an Object.prototype member.
  const strategy = Object.prototype.hasOwnProperty.call(CHUNK_STRATEGIES, taskType)
    ? CHUNK_STRATEGIES[taskType]
    : CHUNK_STRATEGIES['default']
  const chunkSize = recommendChunkSize(taskType, totalItems)

  // recommendChunkSize returns totalItems for small inputs, so chunkSize is 0
  // when totalItems is 0; guard the division to avoid ceil(0 / 0) = NaN.
  const totalChunks = chunkSize > 0 ? Math.ceil(totalItems / chunkSize) : 0

  return {
    taskType,
    totalItems,
    chunkSize,
    totalChunks,
    strategy,
    avgItemsPerChunk: totalChunks > 0 ? Math.round(totalItems / totalChunks) : 0,
    // A zero remainder means the last chunk is a full one.
    lastChunkSize: totalChunks > 0 ? totalItems % chunkSize || chunkSize : 0
  }
}
|
||
|
||
/**
 * Builds a human-readable label for a batch index.
 *
 * The label uses a 1-based position. When that position equals
 * `totalBatches`, a "last batch" marker is appended.
 *
 * @param batchIndex Batch index (0-based)
 * @param totalBatches Total number of batches
 * @returns Label string
 *
 * @example
 * ```typescript
 * getBatchDescription(0, 20)  // "批次 1/20"
 * getBatchDescription(19, 20) // "批次 20/20(最后一批)"
 * ```
 */
export function getBatchDescription(batchIndex: number, totalBatches: number): string {
  const humanIndex = batchIndex + 1

  if (humanIndex === totalBatches) {
    return `批次 ${humanIndex}/${totalBatches}(最后一批)`
  }

  return `批次 ${humanIndex}/${totalBatches}`
}
|
||
|
||
/**
 * Estimates how long a batch takes to run, in seconds.
 *
 * Multiplies the batch size by a fixed per-item time for the task type and
 * rounds up. Task types without an entry use the `'default'` rate of
 * 1 second per item. Per the original author's note, the rates are empirical
 * and the result is meant for showing an expected completion time in the
 * front end.
 *
 * @param taskType Task type
 * @param batchSize Number of items in the batch
 * @returns Estimated duration in whole seconds
 */
export function estimateBatchDuration(taskType: string, batchSize: number): number {
  // Average processing time per item, in seconds.
  const TIME_PER_ITEM: Record<string, number> = {
    'asl:screening:title-abstract': 0.5, // 0.5 s per paper (including the LLM call)
    'asl:screening:full-text': 2, // 2 s per paper
    'asl:extraction': 3, // 3 s per paper
    'dc:clean:batch': 0.1, // 0.1 s per row
    'dc:extract:medical-record': 5, // 5 s per record
    'ssa:analysis:batch': 0.01, // 0.01 s per data row
    'default': 1 // 1 s per item
  }

  // Own-property lookup: inherited keys such as 'constructor' or '__proto__'
  // are truthy on a plain object and would turn the estimate into NaN.
  const timePerItem = Object.prototype.hasOwnProperty.call(TIME_PER_ITEM, taskType)
    ? TIME_PER_ITEM[taskType]
    : TIME_PER_ITEM['default']

  return Math.ceil(batchSize * timePerItem)
}
|
||
|
||
/**
 * Checks that a batch index is valid for the given number of batches.
 *
 * A valid index is an integer in the range `0 .. totalBatches - 1`.
 *
 * @param batchIndex Batch index (0-based)
 * @param totalBatches Total number of batches
 * @throws Error if the index is not an integer or is out of range
 */
export function validateBatchIndex(batchIndex: number, totalBatches: number): void {
  // NaN compares false against everything, so plain range comparisons would
  // let a NaN index (or a NaN total) through. Number.isInteger rejects NaN and
  // fractional indices; `!(a < b)` also rejects a NaN total.
  const isValid =
    Number.isInteger(batchIndex) && batchIndex >= 0 && batchIndex < totalBatches

  if (!isValid) {
    throw new Error(
      `Invalid batch index: ${batchIndex}. Must be between 0 and ${totalBatches - 1}`
    )
  }
}
|
||
|
||
/**
 * Extracts the items belonging to one batch of an array.
 *
 * Batch `batchIndex` covers positions `batchIndex * chunkSize` up to (but not
 * including) `(batchIndex + 1) * chunkSize`, clipped to the end of the array.
 * The input array is not modified.
 *
 * @param items Full data array
 * @param batchIndex Batch index (0-based)
 * @param chunkSize Batch size
 * @returns Items of that batch; an empty array when the batch lies outside
 *   the data, when `batchIndex` is negative, or when `chunkSize` is not positive
 *
 * @example
 * ```typescript
 * const ids = [1,2,3,4,5,6,7,8,9,10]
 * getBatchItems(ids, 0, 3)  // [1,2,3]
 * getBatchItems(ids, 1, 3)  // [4,5,6]
 * getBatchItems(ids, 3, 3)  // [10]
 * ```
 */
export function getBatchItems<T>(
  items: readonly T[],
  batchIndex: number,
  chunkSize: number
): T[] {
  // slice() treats negative bounds as offsets from the end of the array, so a
  // negative batch index would silently return items from the tail. Treat it
  // (and a non-positive size) as "no such batch" instead.
  if (batchIndex < 0 || chunkSize <= 0) {
    return []
  }

  const start = batchIndex * chunkSize

  // slice() clips the end bound to the array length.
  return items.slice(start, start + chunkSize)
}
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|