feat(pkb): Complete PKB module frontend migration with V3 design
Summary: - Implement PKB Dashboard and Workspace pages based on V3 prototype - Add single-layer header with integrated Tab navigation - Implement 3 work modes: Full Text, Deep Read, Batch Processing - Integrate Ant Design X Chat component for AI conversations - Create BatchModeComplete with template selection and document processing - Add compact work mode selector with dropdown design Backend: - Migrate PKB controllers and services to /modules/pkb structure - Register v2 API routes at /api/v2/pkb/knowledge - Maintain dual API routes for backward compatibility Technical details: - Use Zustand for state management - Handle SSE streaming responses for AI chat - Support document selection for Deep Read mode - Implement batch processing with progress tracking Known issues: - Batch processing API integration pending - Knowledge assets page navigation needs optimization Status: Frontend functional, pending refinement
This commit is contained in:
428
backend/src/modules/pkb/controllers/batchController.ts
Normal file
428
backend/src/modules/pkb/controllers/batchController.ts
Normal file
@@ -0,0 +1,428 @@
|
||||
/**
|
||||
* Phase 3: 批处理模式 - 批处理控制器
|
||||
*
|
||||
* API路由:
|
||||
* - POST /api/v1/batch/execute - 执行批处理任务
|
||||
* - GET /api/v1/batch/tasks/:taskId - 获取任务状态
|
||||
* - GET /api/v1/batch/tasks/:taskId/results - 获取任务结果
|
||||
* - POST /api/v1/batch/tasks/:taskId/retry-failed - 重试失败项
|
||||
*/
|
||||
|
||||
import { FastifyRequest, FastifyReply } from 'fastify';
|
||||
import { executeBatchTask, retryFailedDocuments, BatchProgress } from '../services/batchService.js';
|
||||
import { prisma } from '../../../config/database.js';
|
||||
import { ModelType } from '../../../common/llm/adapters/types.js';
|
||||
|
||||
// ==================== 类型定义 ====================
|
||||
|
||||
interface ExecuteBatchBody {
|
||||
kb_id: string;
|
||||
document_ids: string[];
|
||||
template_type: 'preset' | 'custom';
|
||||
template_id?: string;
|
||||
custom_prompt?: string;
|
||||
model_type: ModelType;
|
||||
task_name?: string;
|
||||
}
|
||||
|
||||
interface TaskIdParams {
|
||||
taskId: string;
|
||||
}
|
||||
|
||||
// ==================== API处理器 ====================
|
||||
|
||||
/**
|
||||
* POST /api/v1/batch/execute
|
||||
* 执行批处理任务
|
||||
*/
|
||||
export async function executeBatch(
|
||||
request: FastifyRequest<{ Body: ExecuteBatchBody }>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
// TODO: 从JWT获取userId
|
||||
const userId = 'user-mock-001';
|
||||
|
||||
const {
|
||||
kb_id,
|
||||
document_ids,
|
||||
template_type,
|
||||
template_id,
|
||||
custom_prompt,
|
||||
model_type,
|
||||
task_name,
|
||||
} = request.body;
|
||||
|
||||
console.log('📦 [BatchController] 收到批处理请求', {
|
||||
userId,
|
||||
kbId: kb_id,
|
||||
documentCount: document_ids.length,
|
||||
templateType: template_type,
|
||||
modelType: model_type,
|
||||
});
|
||||
|
||||
// 验证参数
|
||||
if (!kb_id || !document_ids || document_ids.length === 0) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: '缺少必要参数:kb_id 或 document_ids',
|
||||
});
|
||||
}
|
||||
|
||||
if (document_ids.length < 3) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: '文献数量不能少于3篇',
|
||||
});
|
||||
}
|
||||
|
||||
if (document_ids.length > 50) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: '文献数量不能超过50篇',
|
||||
});
|
||||
}
|
||||
|
||||
if (template_type === 'preset' && !template_id) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: '预设模板类型需要提供 template_id',
|
||||
});
|
||||
}
|
||||
|
||||
if (template_type === 'custom' && !custom_prompt) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: '自定义模板需要提供 custom_prompt',
|
||||
});
|
||||
}
|
||||
|
||||
// 验证模型类型
|
||||
const validModels: ModelType[] = ['deepseek-v3', 'qwen3-72b', 'qwen-long'];
|
||||
if (!validModels.includes(model_type)) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: `不支持的模型类型: ${model_type}`,
|
||||
});
|
||||
}
|
||||
|
||||
// 验证知识库是否存在
|
||||
const kb = await prisma.knowledgeBase.findUnique({
|
||||
where: { id: kb_id },
|
||||
});
|
||||
|
||||
if (!kb) {
|
||||
return reply.code(404).send({
|
||||
success: false,
|
||||
message: `知识库不存在: ${kb_id}`,
|
||||
});
|
||||
}
|
||||
|
||||
// 验证文档是否都存在
|
||||
const documents = await prisma.document.findMany({
|
||||
where: {
|
||||
id: { in: document_ids },
|
||||
kbId: kb_id,
|
||||
},
|
||||
});
|
||||
|
||||
if (documents.length !== document_ids.length) {
|
||||
return reply.code(400).send({
|
||||
success: false,
|
||||
message: `部分文档不存在或不属于该知识库`,
|
||||
});
|
||||
}
|
||||
|
||||
// 获取WebSocket实例(用于进度推送)
|
||||
const io = (request.server as any).io;
|
||||
|
||||
// 先创建任务记录获取taskId
|
||||
const taskPreview = await prisma.batchTask.create({
|
||||
data: {
|
||||
userId,
|
||||
kbId: kb_id,
|
||||
name: task_name || `批处理任务_${new Date().toLocaleString('zh-CN')}`,
|
||||
templateType: template_type,
|
||||
templateId: template_id || null,
|
||||
prompt: custom_prompt || template_id || '',
|
||||
status: 'processing',
|
||||
totalDocuments: document_ids.length,
|
||||
modelType: model_type,
|
||||
concurrency: 3,
|
||||
startedAt: new Date(),
|
||||
},
|
||||
});
|
||||
|
||||
const taskId = taskPreview.id;
|
||||
console.log(`✅ [BatchController] 创建任务: ${taskId}`);
|
||||
|
||||
// 执行批处理任务(异步)
|
||||
executeBatchTask({
|
||||
userId,
|
||||
kbId: kb_id,
|
||||
documentIds: document_ids,
|
||||
templateType: template_type,
|
||||
templateId: template_id,
|
||||
customPrompt: custom_prompt,
|
||||
modelType: model_type,
|
||||
taskName: task_name,
|
||||
existingTaskId: taskId, // 使用已创建的任务ID
|
||||
onProgress: (progress: BatchProgress) => {
|
||||
// WebSocket推送进度
|
||||
if (io) {
|
||||
io.to(userId).emit('batch-progress', progress);
|
||||
}
|
||||
},
|
||||
})
|
||||
.then((result) => {
|
||||
console.log(`🎉 [BatchController] 批处理任务完成: ${result.taskId}`);
|
||||
// 推送完成事件
|
||||
if (io) {
|
||||
io.to(userId).emit('batch-completed', {
|
||||
task_id: result.taskId,
|
||||
status: result.status,
|
||||
});
|
||||
}
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(`❌ [BatchController] 批处理任务失败:`, error);
|
||||
// 推送失败事件
|
||||
if (io) {
|
||||
io.to(userId).emit('batch-failed', {
|
||||
task_id: 'unknown',
|
||||
error: error.message,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// 立即返回任务ID(任务在后台执行)
|
||||
reply.send({
|
||||
success: true,
|
||||
message: '批处理任务已开始',
|
||||
data: {
|
||||
task_id: taskId,
|
||||
status: 'processing',
|
||||
websocket_event: 'batch-progress',
|
||||
},
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ [BatchController] 执行批处理失败:', error);
|
||||
reply.code(500).send({
|
||||
success: false,
|
||||
message: error.message || '执行批处理任务失败',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GET /api/v1/batch/tasks/:taskId
|
||||
* 获取任务状态
|
||||
*/
|
||||
export async function getTask(
|
||||
request: FastifyRequest<{ Params: TaskIdParams }>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { taskId } = request.params;
|
||||
|
||||
const task = await prisma.batchTask.findUnique({
|
||||
where: { id: taskId },
|
||||
select: {
|
||||
id: true,
|
||||
name: true,
|
||||
status: true,
|
||||
totalDocuments: true,
|
||||
completedCount: true,
|
||||
failedCount: true,
|
||||
modelType: true,
|
||||
startedAt: true,
|
||||
completedAt: true,
|
||||
durationSeconds: true,
|
||||
createdAt: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!task) {
|
||||
return reply.code(404).send({
|
||||
success: false,
|
||||
message: `任务不存在: ${taskId}`,
|
||||
});
|
||||
}
|
||||
|
||||
reply.send({
|
||||
success: true,
|
||||
data: {
|
||||
id: task.id,
|
||||
name: task.name,
|
||||
status: task.status,
|
||||
total_documents: task.totalDocuments,
|
||||
completed_count: task.completedCount,
|
||||
failed_count: task.failedCount,
|
||||
model_type: task.modelType,
|
||||
started_at: task.startedAt,
|
||||
completed_at: task.completedAt,
|
||||
duration_seconds: task.durationSeconds,
|
||||
created_at: task.createdAt,
|
||||
},
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ [BatchController] 获取任务失败:', error);
|
||||
reply.code(500).send({
|
||||
success: false,
|
||||
message: error.message || '获取任务失败',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GET /api/v1/batch/tasks/:taskId/results
|
||||
* 获取任务结果
|
||||
*/
|
||||
export async function getTaskResults(
|
||||
request: FastifyRequest<{ Params: TaskIdParams }>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { taskId } = request.params;
|
||||
|
||||
// 获取任务信息
|
||||
const task = await prisma.batchTask.findUnique({
|
||||
where: { id: taskId },
|
||||
include: {
|
||||
results: {
|
||||
include: {
|
||||
document: {
|
||||
select: {
|
||||
filename: true,
|
||||
tokensCount: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
orderBy: {
|
||||
createdAt: 'asc',
|
||||
},
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
if (!task) {
|
||||
return reply.code(404).send({
|
||||
success: false,
|
||||
message: `任务不存在: ${taskId}`,
|
||||
});
|
||||
}
|
||||
|
||||
// 格式化结果
|
||||
const results = task.results.map((r, index) => ({
|
||||
id: r.id,
|
||||
index: index + 1,
|
||||
document_id: r.documentId,
|
||||
document_name: r.document.filename,
|
||||
status: r.status,
|
||||
data: r.data,
|
||||
raw_output: r.rawOutput,
|
||||
error_message: r.errorMessage,
|
||||
processing_time_ms: r.processingTimeMs,
|
||||
tokens_used: r.tokensUsed,
|
||||
created_at: r.createdAt,
|
||||
}));
|
||||
|
||||
reply.send({
|
||||
success: true,
|
||||
data: {
|
||||
task: {
|
||||
id: task.id,
|
||||
name: task.name,
|
||||
status: task.status,
|
||||
template_type: task.templateType,
|
||||
template_id: task.templateId,
|
||||
total_documents: task.totalDocuments,
|
||||
completed_count: task.completedCount,
|
||||
failed_count: task.failedCount,
|
||||
duration_seconds: task.durationSeconds,
|
||||
created_at: task.createdAt,
|
||||
completed_at: task.completedAt,
|
||||
},
|
||||
results,
|
||||
},
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ [BatchController] 获取任务结果失败:', error);
|
||||
reply.code(500).send({
|
||||
success: false,
|
||||
message: error.message || '获取任务结果失败',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* POST /api/v1/batch/tasks/:taskId/retry-failed
|
||||
* 重试失败的文档
|
||||
*/
|
||||
export async function retryFailed(
|
||||
request: FastifyRequest<{ Params: TaskIdParams }>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { taskId } = request.params;
|
||||
const userId = 'user-mock-001'; // TODO: 从JWT获取
|
||||
|
||||
// 获取WebSocket实例
|
||||
const io = (request.server as any).io;
|
||||
|
||||
// 执行重试(异步)
|
||||
retryFailedDocuments(taskId, (progress: BatchProgress) => {
|
||||
if (io) {
|
||||
io.to(userId).emit('batch-progress', progress);
|
||||
}
|
||||
})
|
||||
.then((result) => {
|
||||
console.log(`✅ [BatchController] 重试完成: ${result.retriedCount}篇`);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error(`❌ [BatchController] 重试失败:`, error);
|
||||
});
|
||||
|
||||
reply.send({
|
||||
success: true,
|
||||
message: '已开始重试失败的文档',
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ [BatchController] 重试失败:', error);
|
||||
reply.code(500).send({
|
||||
success: false,
|
||||
message: error.message || '重试失败',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* GET /api/v1/batch/templates
|
||||
* 获取所有预设模板
|
||||
*/
|
||||
export async function getTemplates(
|
||||
request: FastifyRequest,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { getAllTemplates } = await import('../templates/clinicalResearch.js');
|
||||
const templates = getAllTemplates();
|
||||
|
||||
reply.send({
|
||||
success: true,
|
||||
data: templates.map(t => ({
|
||||
id: t.id,
|
||||
name: t.name,
|
||||
description: t.description,
|
||||
output_fields: t.outputFields,
|
||||
})),
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ [BatchController] 获取模板失败:', error);
|
||||
reply.code(500).send({
|
||||
success: false,
|
||||
message: error.message || '获取模板失败',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
314
backend/src/modules/pkb/controllers/documentController.ts
Normal file
314
backend/src/modules/pkb/controllers/documentController.ts
Normal file
@@ -0,0 +1,314 @@
|
||||
import type { FastifyRequest, FastifyReply } from 'fastify';
|
||||
import * as documentService from '../services/documentService.js';
|
||||
|
||||
// Mock用户ID(实际应从JWT token中获取)
|
||||
const MOCK_USER_ID = 'user-mock-001';
|
||||
|
||||
/**
|
||||
* 上传文档
|
||||
*/
|
||||
export async function uploadDocument(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
kbId: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { kbId } = request.params;
|
||||
console.log(`📤 开始上传文档到知识库: ${kbId}`);
|
||||
|
||||
// 获取上传的文件
|
||||
const data = await request.file();
|
||||
|
||||
if (!data) {
|
||||
console.error('❌ 没有接收到文件');
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: 'No file uploaded',
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`📄 接收到文件: ${data.filename}, 类型: ${data.mimetype}`);
|
||||
|
||||
const file = await data.toBuffer();
|
||||
const filename = data.filename;
|
||||
const fileType = data.mimetype;
|
||||
const fileSizeBytes = file.length;
|
||||
|
||||
// 文件大小限制(10MB)
|
||||
const maxSize = 10 * 1024 * 1024;
|
||||
console.log(`📊 文件大小: ${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB (限制: 10MB)`);
|
||||
|
||||
if (fileSizeBytes > maxSize) {
|
||||
console.error(`❌ 文件太大: ${(fileSizeBytes / 1024 / 1024).toFixed(2)}MB`);
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: 'File size exceeds 10MB limit',
|
||||
});
|
||||
}
|
||||
|
||||
// 文件类型限制
|
||||
const allowedTypes = [
|
||||
'application/pdf',
|
||||
'application/msword',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'text/plain',
|
||||
'text/markdown',
|
||||
];
|
||||
|
||||
console.log(`🔍 检查文件类型: ${fileType}`);
|
||||
if (!allowedTypes.includes(fileType)) {
|
||||
console.error(`❌ 不支持的文件类型: ${fileType}`);
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: 'File type not supported. Allowed: PDF, DOC, DOCX, TXT, MD',
|
||||
});
|
||||
}
|
||||
|
||||
// 上传文档(这里fileUrl暂时为空,实际应该上传到对象存储)
|
||||
console.log(`⚙️ 调用文档服务上传文件...`);
|
||||
const document = await documentService.uploadDocument(
|
||||
MOCK_USER_ID,
|
||||
kbId,
|
||||
file,
|
||||
filename,
|
||||
fileType,
|
||||
fileSizeBytes,
|
||||
'' // fileUrl - 可以上传到OSS后填入
|
||||
);
|
||||
|
||||
console.log(`✅ 文档上传成功: ${document.id}`);
|
||||
return reply.status(201).send({
|
||||
success: true,
|
||||
data: document,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('❌ 文档上传失败:', error.message);
|
||||
console.error('错误详情:', error);
|
||||
|
||||
if (error.message.includes('not found') || error.message.includes('access denied')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
if (error.message.includes('limit exceeded')) {
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to upload document',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文档列表
|
||||
*/
|
||||
export async function getDocuments(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
kbId: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { kbId } = request.params;
|
||||
|
||||
const documents = await documentService.getDocuments(MOCK_USER_ID, kbId);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: documents,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get documents:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get documents',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文档详情
|
||||
*/
|
||||
export async function getDocumentById(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
const document = await documentService.getDocumentById(MOCK_USER_ID, id);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: document,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get document:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get document',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除文档
|
||||
*/
|
||||
export async function deleteDocument(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
await documentService.deleteDocument(MOCK_USER_ID, id);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
message: 'Document deleted successfully',
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to delete document:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to delete document',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 重新处理文档
|
||||
*/
|
||||
export async function reprocessDocument(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
await documentService.reprocessDocument(MOCK_USER_ID, id);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
message: 'Document reprocessing started',
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to reprocess document:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to reprocess document',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 2: 获取文档全文(用于逐篇精读模式)
|
||||
*/
|
||||
export async function getDocumentFullText(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
const document = await documentService.getDocumentById(MOCK_USER_ID, id);
|
||||
|
||||
// 返回完整的文档信息
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: {
|
||||
documentId: document.id,
|
||||
filename: document.filename,
|
||||
fileType: document.fileType,
|
||||
fileSizeBytes: document.fileSizeBytes,
|
||||
extractedText: (document as any).extractedText || null,
|
||||
charCount: (document as any).charCount || null,
|
||||
tokensCount: document.tokensCount || null,
|
||||
extractionMethod: (document as any).extractionMethod || null,
|
||||
extractionQuality: (document as any).extractionQuality || null,
|
||||
language: (document as any).language || null,
|
||||
metadata: {
|
||||
uploadedAt: document.uploadedAt,
|
||||
processedAt: document.processedAt,
|
||||
status: document.status,
|
||||
},
|
||||
},
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get document full text:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get document full text',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
341
backend/src/modules/pkb/controllers/knowledgeBaseController.ts
Normal file
341
backend/src/modules/pkb/controllers/knowledgeBaseController.ts
Normal file
@@ -0,0 +1,341 @@
|
||||
import type { FastifyRequest, FastifyReply } from 'fastify';
|
||||
import * as knowledgeBaseService from '../services/knowledgeBaseService.js';
|
||||
|
||||
// Mock用户ID(实际应从JWT token中获取)
|
||||
const MOCK_USER_ID = 'user-mock-001';
|
||||
|
||||
/**
|
||||
* 创建知识库
|
||||
*/
|
||||
export async function createKnowledgeBase(
|
||||
request: FastifyRequest<{
|
||||
Body: {
|
||||
name: string;
|
||||
description?: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { name, description } = request.body;
|
||||
|
||||
if (!name || name.trim().length === 0) {
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: 'Knowledge base name is required',
|
||||
});
|
||||
}
|
||||
|
||||
const knowledgeBase = await knowledgeBaseService.createKnowledgeBase(
|
||||
MOCK_USER_ID,
|
||||
name,
|
||||
description
|
||||
);
|
||||
|
||||
return reply.status(201).send({
|
||||
success: true,
|
||||
data: knowledgeBase,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to create knowledge base:', error);
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to create knowledge base',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库列表
|
||||
*/
|
||||
export async function getKnowledgeBases(
|
||||
_request: FastifyRequest,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const knowledgeBases = await knowledgeBaseService.getKnowledgeBases(
|
||||
MOCK_USER_ID
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: knowledgeBases,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get knowledge bases:', error);
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get knowledge bases',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库详情
|
||||
*/
|
||||
export async function getKnowledgeBaseById(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
const knowledgeBase = await knowledgeBaseService.getKnowledgeBaseById(
|
||||
MOCK_USER_ID,
|
||||
id
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: knowledgeBase,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get knowledge base:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get knowledge base',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新知识库
|
||||
*/
|
||||
export async function updateKnowledgeBase(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
Body: {
|
||||
name?: string;
|
||||
description?: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
const updateData = request.body;
|
||||
|
||||
const knowledgeBase = await knowledgeBaseService.updateKnowledgeBase(
|
||||
MOCK_USER_ID,
|
||||
id,
|
||||
updateData
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: knowledgeBase,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to update knowledge base:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to update knowledge base',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除知识库
|
||||
*/
|
||||
export async function deleteKnowledgeBase(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
await knowledgeBaseService.deleteKnowledgeBase(MOCK_USER_ID, id);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
message: 'Knowledge base deleted successfully',
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to delete knowledge base:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to delete knowledge base',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索知识库
|
||||
*/
|
||||
export async function searchKnowledgeBase(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
Querystring: {
|
||||
query: string;
|
||||
top_k?: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
const { query, top_k } = request.query;
|
||||
|
||||
if (!query || query.trim().length === 0) {
|
||||
return reply.status(400).send({
|
||||
success: false,
|
||||
message: 'Query parameter is required',
|
||||
});
|
||||
}
|
||||
|
||||
const topK = top_k ? parseInt(top_k, 10) : 15; // Phase 1优化:默认从3增加到15
|
||||
|
||||
const results = await knowledgeBaseService.searchKnowledgeBase(
|
||||
MOCK_USER_ID,
|
||||
id,
|
||||
query,
|
||||
topK
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: results,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to search knowledge base:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to search knowledge base',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库统计信息
|
||||
*/
|
||||
export async function getKnowledgeBaseStats(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
|
||||
const stats = await knowledgeBaseService.getKnowledgeBaseStats(
|
||||
MOCK_USER_ID,
|
||||
id
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: stats,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get knowledge base stats:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get knowledge base stats',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库文档选择(Phase 2: 全文阅读模式)
|
||||
*/
|
||||
export async function getDocumentSelection(
|
||||
request: FastifyRequest<{
|
||||
Params: {
|
||||
id: string;
|
||||
};
|
||||
Querystring: {
|
||||
max_files?: string;
|
||||
max_tokens?: string;
|
||||
};
|
||||
}>,
|
||||
reply: FastifyReply
|
||||
) {
|
||||
try {
|
||||
const { id } = request.params;
|
||||
const { max_files, max_tokens } = request.query;
|
||||
|
||||
const maxFiles = max_files ? parseInt(max_files, 10) : undefined;
|
||||
const maxTokens = max_tokens ? parseInt(max_tokens, 10) : undefined;
|
||||
|
||||
const selection = await knowledgeBaseService.getDocumentSelection(
|
||||
MOCK_USER_ID,
|
||||
id,
|
||||
maxFiles,
|
||||
maxTokens
|
||||
);
|
||||
|
||||
return reply.send({
|
||||
success: true,
|
||||
data: selection,
|
||||
});
|
||||
} catch (error: any) {
|
||||
console.error('Failed to get document selection:', error);
|
||||
|
||||
if (error.message.includes('not found')) {
|
||||
return reply.status(404).send({
|
||||
success: false,
|
||||
message: error.message,
|
||||
});
|
||||
}
|
||||
|
||||
return reply.status(500).send({
|
||||
success: false,
|
||||
message: error.message || 'Failed to get document selection',
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
12
backend/src/modules/pkb/index.ts
Normal file
12
backend/src/modules/pkb/index.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
/**
|
||||
* PKB(个人知识库)模块入口
|
||||
*
|
||||
* 功能:
|
||||
* - 知识库CRUD
|
||||
* - 文档上传和管理
|
||||
* - RAG检索
|
||||
* - 批处理任务
|
||||
*/
|
||||
|
||||
export { default as pkbRoutes } from './routes/index.js';
|
||||
|
||||
38
backend/src/modules/pkb/routes/batchRoutes.ts
Normal file
38
backend/src/modules/pkb/routes/batchRoutes.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
/**
|
||||
* Phase 3: 批处理模式 - 路由配置
|
||||
*/
|
||||
|
||||
import { FastifyInstance } from 'fastify';
|
||||
import {
|
||||
executeBatch,
|
||||
getTask,
|
||||
getTaskResults,
|
||||
retryFailed,
|
||||
getTemplates,
|
||||
} from '../controllers/batchController.js';
|
||||
|
||||
export default async function batchRoutes(fastify: FastifyInstance) {
|
||||
// 执行批处理任务
|
||||
fastify.post('/batch/execute', executeBatch);
|
||||
|
||||
// 获取任务状态
|
||||
fastify.get('/batch/tasks/:taskId', getTask);
|
||||
|
||||
// 获取任务结果
|
||||
fastify.get('/batch/tasks/:taskId/results', getTaskResults);
|
||||
|
||||
// 重试失败的文档
|
||||
fastify.post('/batch/tasks/:taskId/retry-failed', retryFailed);
|
||||
|
||||
// 获取所有预设模板
|
||||
fastify.get('/batch/templates', getTemplates);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
45
backend/src/modules/pkb/routes/health.ts
Normal file
45
backend/src/modules/pkb/routes/health.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
/**
|
||||
* PKB模块健康检查路由
|
||||
*/
|
||||
import { FastifyInstance, FastifyRequest, FastifyReply } from 'fastify';
|
||||
import { prisma } from '../../../config/database.js';
|
||||
|
||||
export default async function healthRoutes(fastify: FastifyInstance) {
|
||||
// PKB模块健康检查
|
||||
fastify.get('/health', async (request: FastifyRequest, reply: FastifyReply) => {
|
||||
try {
|
||||
// 检查数据库连接
|
||||
await prisma.$queryRaw`SELECT 1`;
|
||||
|
||||
// 检查pkb_schema是否可访问
|
||||
const kbCount = await prisma.knowledgeBase.count();
|
||||
|
||||
return reply.send({
|
||||
status: 'ok',
|
||||
module: 'pkb',
|
||||
version: 'v2',
|
||||
timestamp: new Date().toISOString(),
|
||||
database: {
|
||||
connected: true,
|
||||
schema: 'pkb_schema',
|
||||
knowledgeBases: kbCount,
|
||||
},
|
||||
message: 'PKB模块运行正常',
|
||||
});
|
||||
} catch (error: any) {
|
||||
return reply.status(503).send({
|
||||
status: 'error',
|
||||
module: 'pkb',
|
||||
version: 'v2',
|
||||
timestamp: new Date().toISOString(),
|
||||
database: {
|
||||
connected: false,
|
||||
error: error.message,
|
||||
},
|
||||
message: 'PKB模块健康检查失败',
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
19
backend/src/modules/pkb/routes/index.ts
Normal file
19
backend/src/modules/pkb/routes/index.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
/**
|
||||
* PKB模块路由入口
|
||||
*/
|
||||
import { FastifyInstance } from 'fastify';
|
||||
import knowledgeBaseRoutes from './knowledgeBases.js';
|
||||
import batchRoutes from './batchRoutes.js';
|
||||
import healthRoutes from './health.js';
|
||||
|
||||
export default async function pkbRoutes(fastify: FastifyInstance) {
|
||||
// 健康检查路由
|
||||
fastify.register(healthRoutes);
|
||||
|
||||
// 注册知识库路由
|
||||
fastify.register(knowledgeBaseRoutes, { prefix: '/knowledge' });
|
||||
|
||||
// 注册批处理路由
|
||||
fastify.register(batchRoutes, { prefix: '/batch-tasks' });
|
||||
}
|
||||
|
||||
53
backend/src/modules/pkb/routes/knowledgeBases.ts
Normal file
53
backend/src/modules/pkb/routes/knowledgeBases.ts
Normal file
@@ -0,0 +1,53 @@
|
||||
import type { FastifyInstance } from 'fastify';
|
||||
import * as knowledgeBaseController from '../controllers/knowledgeBaseController.js';
|
||||
import * as documentController from '../controllers/documentController.js';
|
||||
|
||||
export default async function knowledgeBaseRoutes(fastify: FastifyInstance) {
|
||||
// ==================== 知识库管理 API ====================
|
||||
|
||||
// 创建知识库
|
||||
fastify.post('/knowledge-bases', knowledgeBaseController.createKnowledgeBase);
|
||||
|
||||
// 获取知识库列表
|
||||
fastify.get('/knowledge-bases', knowledgeBaseController.getKnowledgeBases);
|
||||
|
||||
// 获取知识库详情
|
||||
fastify.get('/knowledge-bases/:id', knowledgeBaseController.getKnowledgeBaseById);
|
||||
|
||||
// 更新知识库
|
||||
fastify.put('/knowledge-bases/:id', knowledgeBaseController.updateKnowledgeBase);
|
||||
|
||||
// 删除知识库
|
||||
fastify.delete('/knowledge-bases/:id', knowledgeBaseController.deleteKnowledgeBase);
|
||||
|
||||
// 检索知识库
|
||||
fastify.get('/knowledge-bases/:id/search', knowledgeBaseController.searchKnowledgeBase);
|
||||
|
||||
// 获取知识库统计信息
|
||||
fastify.get('/knowledge-bases/:id/stats', knowledgeBaseController.getKnowledgeBaseStats);
|
||||
|
||||
// Phase 2: 获取文档选择(全文阅读模式)
|
||||
fastify.get('/knowledge-bases/:id/document-selection', knowledgeBaseController.getDocumentSelection);
|
||||
|
||||
// ==================== 文档管理 API ====================
|
||||
|
||||
// 上传文档
|
||||
fastify.post('/knowledge-bases/:kbId/documents', documentController.uploadDocument);
|
||||
|
||||
// 获取文档列表
|
||||
fastify.get('/knowledge-bases/:kbId/documents', documentController.getDocuments);
|
||||
|
||||
// 获取文档详情
|
||||
fastify.get('/documents/:id', documentController.getDocumentById);
|
||||
|
||||
// Phase 2: 获取文档全文
|
||||
fastify.get('/documents/:id/full-text', documentController.getDocumentFullText);
|
||||
|
||||
// 删除文档
|
||||
fastify.delete('/documents/:id', documentController.deleteDocument);
|
||||
|
||||
// 重新处理文档
|
||||
fastify.post('/documents/:id/reprocess', documentController.reprocessDocument);
|
||||
}
|
||||
|
||||
|
||||
420
backend/src/modules/pkb/services/batchService.ts
Normal file
420
backend/src/modules/pkb/services/batchService.ts
Normal file
@@ -0,0 +1,420 @@
|
||||
/**
|
||||
* Phase 3: 批处理模式 - 批处理服务
|
||||
*
|
||||
* 核心功能:
|
||||
* 1. 执行批处理任务(3并发)
|
||||
* 2. 处理单个文档
|
||||
* 3. 进度推送(WebSocket)
|
||||
* 4. 错误处理和重试
|
||||
*/
|
||||
|
||||
import PQueue from 'p-queue';
|
||||
import { prisma } from '../../../config/database.js';
|
||||
import { LLMFactory } from '../../../common/llm/adapters/LLMFactory.js';
|
||||
import { ModelType } from '../../../common/llm/adapters/types.js';
|
||||
import { getTemplate } from '../../../legacy/templates/clinicalResearch.js';
|
||||
import { parseJSON } from '../../../common/utils/jsonParser.js';
|
||||
|
||||
export interface ExecuteBatchTaskParams {
|
||||
userId: string;
|
||||
kbId: string;
|
||||
documentIds: string[];
|
||||
templateType: 'preset' | 'custom';
|
||||
templateId?: string; // 预设模板ID
|
||||
customPrompt?: string; // 自定义提示词
|
||||
modelType: ModelType;
|
||||
taskName?: string;
|
||||
existingTaskId?: string; // 已存在的任务ID(可选)
|
||||
onProgress?: (progress: BatchProgress) => void;
|
||||
}
|
||||
|
||||
export interface BatchProgress {
|
||||
taskId: string;
|
||||
completed: number;
|
||||
total: number;
|
||||
failed: number;
|
||||
currentDocument?: string;
|
||||
estimatedSeconds?: number;
|
||||
}
|
||||
|
||||
export interface BatchTaskResult {
|
||||
taskId: string;
|
||||
status: 'processing' | 'completed' | 'failed';
|
||||
totalDocuments: number;
|
||||
completedCount: number;
|
||||
failedCount: number;
|
||||
durationSeconds?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* 执行批处理任务
|
||||
*/
|
||||
export async function executeBatchTask(
|
||||
params: ExecuteBatchTaskParams
|
||||
): Promise<BatchTaskResult> {
|
||||
const {
|
||||
userId,
|
||||
kbId,
|
||||
documentIds,
|
||||
templateType,
|
||||
templateId,
|
||||
customPrompt,
|
||||
modelType,
|
||||
taskName,
|
||||
existingTaskId,
|
||||
onProgress,
|
||||
} = params;
|
||||
|
||||
console.log('📦 [BatchService] 开始执行批处理任务', {
|
||||
userId,
|
||||
kbId,
|
||||
documentCount: documentIds.length,
|
||||
templateType,
|
||||
modelType,
|
||||
existingTaskId: existingTaskId || '新建',
|
||||
});
|
||||
|
||||
// 验证文献数量 (3-50篇)
|
||||
if (documentIds.length < 3 || documentIds.length > 50) {
|
||||
throw new Error(`文献数量必须在3-50篇之间,当前:${documentIds.length}篇`);
|
||||
}
|
||||
|
||||
// 获取模板或使用自定义提示词
|
||||
let systemPrompt: string;
|
||||
let userPromptTemplate: string;
|
||||
let expectedFields: string[] = [];
|
||||
|
||||
if (templateType === 'preset') {
|
||||
if (!templateId) {
|
||||
throw new Error('预设模板类型需要提供templateId');
|
||||
}
|
||||
|
||||
const template = getTemplate(templateId);
|
||||
if (!template) {
|
||||
throw new Error(`未找到模板: ${templateId}`);
|
||||
}
|
||||
|
||||
systemPrompt = template.systemPrompt;
|
||||
userPromptTemplate = template.userPrompt;
|
||||
expectedFields = template.outputFields.map(f => f.key);
|
||||
} else {
|
||||
// 自定义模板
|
||||
if (!customPrompt) {
|
||||
throw new Error('自定义模板需要提供customPrompt');
|
||||
}
|
||||
|
||||
systemPrompt = '你是一个专业的文献分析助手。请根据用户的要求分析文献内容。';
|
||||
userPromptTemplate = customPrompt;
|
||||
}
|
||||
|
||||
// 使用已存在的任务或创建新任务
|
||||
let task;
|
||||
if (existingTaskId) {
|
||||
task = await prisma.batchTask.findUnique({
|
||||
where: { id: existingTaskId },
|
||||
});
|
||||
if (!task) {
|
||||
throw new Error(`任务不存在: ${existingTaskId}`);
|
||||
}
|
||||
console.log(`✅ [BatchService] 使用已存在的任务: ${task.id}`);
|
||||
} else {
|
||||
task = await prisma.batchTask.create({
|
||||
data: {
|
||||
userId,
|
||||
kbId,
|
||||
name: taskName || `批处理任务_${new Date().toLocaleString('zh-CN')}`,
|
||||
templateType,
|
||||
templateId: templateId || null,
|
||||
prompt: userPromptTemplate,
|
||||
status: 'processing',
|
||||
totalDocuments: documentIds.length,
|
||||
completedCount: 0,
|
||||
failedCount: 0,
|
||||
modelType,
|
||||
concurrency: 3, // 固定3并发
|
||||
startedAt: new Date(),
|
||||
},
|
||||
});
|
||||
console.log(`✅ [BatchService] 创建任务记录: ${task.id}`);
|
||||
}
|
||||
|
||||
const startTime = Date.now();
|
||||
let completedCount = 0;
|
||||
let failedCount = 0;
|
||||
|
||||
// 创建并发队列(固定3并发)
|
||||
const queue = new PQueue({ concurrency: 3 });
|
||||
|
||||
// 处理所有文档
|
||||
const promises = documentIds.map((docId, index) =>
|
||||
queue.add(async () => {
|
||||
try {
|
||||
console.log(`🔄 [BatchService] 处理文档 ${index + 1}/${documentIds.length}: ${docId}`);
|
||||
|
||||
// 获取文档
|
||||
const document = await prisma.document.findUnique({
|
||||
where: { id: docId },
|
||||
select: {
|
||||
id: true,
|
||||
filename: true,
|
||||
extractedText: true,
|
||||
tokensCount: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!document) {
|
||||
throw new Error(`文档不存在: ${docId}`);
|
||||
}
|
||||
|
||||
if (!document.extractedText) {
|
||||
throw new Error(`文档未提取文本: ${document.filename}`);
|
||||
}
|
||||
|
||||
// 调用LLM处理
|
||||
const result = await processDocument({
|
||||
document: { ...document, extractedText: document.extractedText! } as any,
|
||||
systemPrompt,
|
||||
userPromptTemplate,
|
||||
modelType,
|
||||
templateType,
|
||||
expectedFields,
|
||||
});
|
||||
|
||||
// 保存结果
|
||||
await prisma.batchResult.create({
|
||||
data: {
|
||||
taskId: task.id,
|
||||
documentId: docId,
|
||||
status: 'success',
|
||||
data: result.data,
|
||||
rawOutput: result.rawOutput,
|
||||
processingTimeMs: result.processingTimeMs,
|
||||
tokensUsed: result.tokensUsed,
|
||||
},
|
||||
});
|
||||
|
||||
completedCount++;
|
||||
console.log(`✅ [BatchService] 文档处理成功: ${document.filename} (${result.processingTimeMs}ms)`);
|
||||
|
||||
} catch (error: any) {
|
||||
// 处理失败
|
||||
console.error(`❌ [BatchService] 文档处理失败: ${docId}`, error);
|
||||
|
||||
await prisma.batchResult.create({
|
||||
data: {
|
||||
taskId: task.id,
|
||||
documentId: docId,
|
||||
status: 'failed',
|
||||
errorMessage: error.message,
|
||||
},
|
||||
});
|
||||
|
||||
failedCount++;
|
||||
}
|
||||
|
||||
// 推送进度
|
||||
if (onProgress) {
|
||||
const progress: BatchProgress = {
|
||||
taskId: task.id,
|
||||
completed: completedCount + failedCount,
|
||||
total: documentIds.length,
|
||||
failed: failedCount,
|
||||
estimatedSeconds: calculateEstimatedTime(
|
||||
completedCount + failedCount,
|
||||
documentIds.length,
|
||||
Date.now() - startTime
|
||||
),
|
||||
};
|
||||
onProgress(progress);
|
||||
}
|
||||
|
||||
// 更新任务进度
|
||||
await prisma.batchTask.update({
|
||||
where: { id: task.id },
|
||||
data: {
|
||||
completedCount,
|
||||
failedCount,
|
||||
},
|
||||
});
|
||||
})
|
||||
);
|
||||
|
||||
// 等待所有任务完成
|
||||
await Promise.allSettled(promises);
|
||||
|
||||
// 计算总时长
|
||||
const durationSeconds = Math.round((Date.now() - startTime) / 1000);
|
||||
|
||||
// 更新任务状态
|
||||
await prisma.batchTask.update({
|
||||
where: { id: task.id },
|
||||
data: {
|
||||
status: 'completed',
|
||||
completedAt: new Date(),
|
||||
durationSeconds,
|
||||
},
|
||||
});
|
||||
|
||||
console.log(`🎉 [BatchService] 批处理任务完成: ${task.id}`, {
|
||||
total: documentIds.length,
|
||||
success: completedCount,
|
||||
failed: failedCount,
|
||||
durationSeconds,
|
||||
});
|
||||
|
||||
return {
|
||||
taskId: task.id,
|
||||
status: 'completed',
|
||||
totalDocuments: documentIds.length,
|
||||
completedCount,
|
||||
failedCount,
|
||||
durationSeconds,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 处理单个文档
|
||||
*/
|
||||
async function processDocument(params: {
|
||||
document: {
|
||||
id: string;
|
||||
filename: string;
|
||||
extractedText: string;
|
||||
tokensCount: number | null;
|
||||
};
|
||||
systemPrompt: string;
|
||||
userPromptTemplate: string;
|
||||
modelType: ModelType;
|
||||
templateType: 'preset' | 'custom';
|
||||
expectedFields: string[];
|
||||
}): Promise<{
|
||||
data: any;
|
||||
rawOutput: string;
|
||||
processingTimeMs: number;
|
||||
tokensUsed?: number;
|
||||
}> {
|
||||
const {
|
||||
document,
|
||||
systemPrompt,
|
||||
userPromptTemplate,
|
||||
modelType,
|
||||
templateType,
|
||||
expectedFields,
|
||||
} = params;
|
||||
|
||||
const startTime = Date.now();
|
||||
|
||||
// 构造完整的用户消息
|
||||
const userMessage = `${userPromptTemplate}\n\n【文献:${document.filename}】\n\n${document.extractedText}`;
|
||||
|
||||
// 调用LLM
|
||||
const adapter = LLMFactory.getAdapter(modelType);
|
||||
const response = await adapter.chat(
|
||||
[
|
||||
{ role: 'system', content: systemPrompt },
|
||||
{ role: 'user', content: userMessage },
|
||||
],
|
||||
{
|
||||
temperature: 0.3, // 降低温度提高稳定性
|
||||
maxTokens: 2000,
|
||||
}
|
||||
);
|
||||
|
||||
const processingTimeMs = Date.now() - startTime;
|
||||
const rawOutput = response.content;
|
||||
|
||||
// 解析结果
|
||||
let data: any;
|
||||
|
||||
if (templateType === 'preset') {
|
||||
// 预设模板:解析JSON
|
||||
const parseResult = parseJSON(rawOutput, expectedFields);
|
||||
|
||||
if (!parseResult.success) {
|
||||
throw new Error(`JSON解析失败: ${parseResult.error}`);
|
||||
}
|
||||
|
||||
data = parseResult.data;
|
||||
} else {
|
||||
// 自定义模板:直接使用文本
|
||||
data = {
|
||||
extracted_text: rawOutput,
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
data,
|
||||
rawOutput,
|
||||
processingTimeMs,
|
||||
tokensUsed: response.usage?.totalTokens,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算预估剩余时间
|
||||
*/
|
||||
function calculateEstimatedTime(
|
||||
completed: number,
|
||||
total: number,
|
||||
elapsedMs: number
|
||||
): number {
|
||||
if (completed === 0) return 0;
|
||||
|
||||
const avgTimePerDoc = elapsedMs / completed;
|
||||
const remaining = total - completed;
|
||||
return Math.round((avgTimePerDoc * remaining) / 1000);
|
||||
}
|
||||
|
||||
/**
|
||||
* 重试失败的文档
|
||||
*/
|
||||
export async function retryFailedDocuments(
|
||||
taskId: string,
|
||||
onProgress?: (progress: BatchProgress) => void
|
||||
): Promise<{ retriedCount: number }> {
|
||||
console.log(`🔄 [BatchService] 重试失败文档: ${taskId}`);
|
||||
|
||||
// 获取任务信息
|
||||
const task = await prisma.batchTask.findUnique({
|
||||
where: { id: taskId },
|
||||
include: {
|
||||
results: {
|
||||
where: { status: 'failed' },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
if (!task) {
|
||||
throw new Error(`任务不存在: ${taskId}`);
|
||||
}
|
||||
|
||||
const failedDocIds = task.results.map(r => r.documentId);
|
||||
|
||||
if (failedDocIds.length === 0) {
|
||||
return { retriedCount: 0 };
|
||||
}
|
||||
|
||||
// 删除旧的失败记录
|
||||
await prisma.batchResult.deleteMany({
|
||||
where: {
|
||||
taskId,
|
||||
status: 'failed',
|
||||
},
|
||||
});
|
||||
|
||||
// 重新执行
|
||||
await executeBatchTask({
|
||||
userId: task.userId,
|
||||
kbId: task.kbId,
|
||||
documentIds: failedDocIds,
|
||||
templateType: task.templateType as 'preset' | 'custom',
|
||||
templateId: task.templateId || undefined,
|
||||
customPrompt: task.templateType === 'custom' ? task.prompt : undefined,
|
||||
modelType: task.modelType as ModelType,
|
||||
taskName: `${task.name} (重试)`,
|
||||
onProgress,
|
||||
});
|
||||
|
||||
return { retriedCount: failedDocIds.length };
|
||||
}
|
||||
|
||||
360
backend/src/modules/pkb/services/documentService.ts
Normal file
360
backend/src/modules/pkb/services/documentService.ts
Normal file
@@ -0,0 +1,360 @@
|
||||
import { prisma } from '../../../config/database.js';
|
||||
import { difyClient } from '../../../common/rag/DifyClient.js';
|
||||
import { extractionClient } from '../../../common/document/ExtractionClient.js';
|
||||
|
||||
/**
|
||||
* 文档服务
|
||||
*/
|
||||
|
||||
/**
|
||||
* 上传文档到知识库
|
||||
*/
|
||||
export async function uploadDocument(
|
||||
userId: string,
|
||||
kbId: string,
|
||||
file: Buffer,
|
||||
filename: string,
|
||||
fileType: string,
|
||||
fileSizeBytes: number,
|
||||
fileUrl: string
|
||||
) {
|
||||
// 1. 验证知识库权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 检查文档数量限制(每个知识库最多50个文档)
|
||||
const documentCount = await prisma.document.count({
|
||||
where: { kbId },
|
||||
});
|
||||
|
||||
if (documentCount >= 50) {
|
||||
throw new Error('Document limit exceeded. Maximum 50 documents per knowledge base');
|
||||
}
|
||||
|
||||
// 3. 在数据库中创建文档记录(状态:uploading)
|
||||
const document = await prisma.document.create({
|
||||
data: {
|
||||
kbId,
|
||||
userId,
|
||||
filename,
|
||||
fileType,
|
||||
fileSizeBytes,
|
||||
fileUrl,
|
||||
difyDocumentId: '', // 暂时为空,稍后更新
|
||||
status: 'uploading',
|
||||
progress: 0,
|
||||
},
|
||||
});
|
||||
|
||||
try {
|
||||
// 4. Phase 2: 调用提取服务提取文本内容
|
||||
let extractionResult;
|
||||
let extractedText = '';
|
||||
let extractionMethod = '';
|
||||
let extractionQuality: number | null = null;
|
||||
let charCount: number | null = null;
|
||||
let detectedLanguage: string | null = null;
|
||||
|
||||
try {
|
||||
console.log(`[Phase2] 开始提取文档: ${filename}`);
|
||||
extractionResult = await extractionClient.extractDocument(file, filename);
|
||||
|
||||
if (extractionResult.success) {
|
||||
extractedText = extractionResult.text;
|
||||
extractionMethod = extractionResult.method;
|
||||
extractionQuality = extractionResult.quality || null;
|
||||
charCount = extractionResult.metadata?.char_count || null;
|
||||
detectedLanguage = extractionResult.language || null;
|
||||
|
||||
console.log(`[Phase2] 提取成功: method=${extractionMethod}, chars=${charCount}, language=${detectedLanguage}`);
|
||||
}
|
||||
} catch (extractionError) {
|
||||
console.error('[Phase2] 文档提取失败,但继续上传到Dify:', extractionError);
|
||||
// 提取失败不影响Dify上传,但记录错误
|
||||
}
|
||||
|
||||
// 5. 上传到Dify
|
||||
const difyResult = await difyClient.uploadDocumentDirectly(
|
||||
knowledgeBase.difyDatasetId,
|
||||
file,
|
||||
filename
|
||||
);
|
||||
|
||||
// 6. 更新文档记录(更新difyDocumentId、状态和Phase 2字段)
|
||||
const updatedDocument = await prisma.document.update({
|
||||
where: { id: document.id },
|
||||
data: {
|
||||
difyDocumentId: difyResult.document.id,
|
||||
status: difyResult.document.indexing_status,
|
||||
progress: 50,
|
||||
// Phase 2新增字段
|
||||
extractedText: extractedText || null,
|
||||
extractionMethod: extractionMethod || null,
|
||||
extractionQuality: extractionQuality,
|
||||
charCount: charCount,
|
||||
language: detectedLanguage,
|
||||
},
|
||||
});
|
||||
|
||||
// 7. 启动后台轮询任务,等待处理完成
|
||||
pollDocumentStatus(userId, kbId, document.id, difyResult.document.id).catch(error => {
|
||||
console.error('Failed to poll document status:', error);
|
||||
});
|
||||
|
||||
// 8. 更新知识库统计
|
||||
await updateKnowledgeBaseStats(kbId);
|
||||
|
||||
// 9. 转换BigInt为Number
|
||||
return {
|
||||
...updatedDocument,
|
||||
fileSizeBytes: Number(updatedDocument.fileSizeBytes),
|
||||
};
|
||||
} catch (error) {
|
||||
// 上传失败,更新状态为error
|
||||
await prisma.document.update({
|
||||
where: { id: document.id },
|
||||
data: {
|
||||
status: 'error',
|
||||
errorMessage: error instanceof Error ? error.message : 'Upload failed',
|
||||
},
|
||||
});
|
||||
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 轮询文档处理状态
|
||||
*/
|
||||
async function pollDocumentStatus(
|
||||
userId: string,
|
||||
kbId: string,
|
||||
documentId: string,
|
||||
difyDocumentId: string,
|
||||
maxAttempts: number = 30
|
||||
) {
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: { id: kbId, userId },
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (let i = 0; i < maxAttempts; i++) {
|
||||
await new Promise(resolve => setTimeout(resolve, 2000)); // 等待2秒
|
||||
|
||||
try {
|
||||
// 查询Dify中的文档状态
|
||||
const difyDocument = await difyClient.getDocument(
|
||||
knowledgeBase.difyDatasetId,
|
||||
difyDocumentId
|
||||
);
|
||||
|
||||
// 更新数据库中的状态
|
||||
await prisma.document.update({
|
||||
where: { id: documentId },
|
||||
data: {
|
||||
status: difyDocument.indexing_status,
|
||||
progress: difyDocument.indexing_status === 'completed' ? 100 : 50 + (i * 2),
|
||||
segmentsCount: difyDocument.indexing_status === 'completed' ? difyDocument.word_count : null,
|
||||
tokensCount: difyDocument.indexing_status === 'completed' ? difyDocument.tokens : null,
|
||||
processedAt: difyDocument.indexing_status === 'completed' ? new Date() : null,
|
||||
errorMessage: difyDocument.error || null,
|
||||
},
|
||||
});
|
||||
|
||||
// 如果完成或失败,退出轮询
|
||||
if (difyDocument.indexing_status === 'completed') {
|
||||
await updateKnowledgeBaseStats(kbId);
|
||||
break;
|
||||
}
|
||||
|
||||
if (difyDocument.indexing_status === 'error') {
|
||||
break;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Polling attempt ${i + 1} failed:`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文档列表
|
||||
*/
|
||||
export async function getDocuments(userId: string, kbId: string) {
|
||||
// 1. 验证权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 查询文档列表
|
||||
const documents = await prisma.document.findMany({
|
||||
where: { kbId },
|
||||
orderBy: { uploadedAt: 'desc' },
|
||||
});
|
||||
|
||||
// 3. 转换BigInt为Number
|
||||
return documents.map(doc => ({
|
||||
...doc,
|
||||
fileSizeBytes: Number(doc.fileSizeBytes),
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取文档详情
|
||||
*/
|
||||
export async function getDocumentById(userId: string, documentId: string) {
|
||||
const document = await prisma.document.findFirst({
|
||||
where: {
|
||||
id: documentId,
|
||||
userId, // 确保只能访问自己的文档
|
||||
},
|
||||
include: {
|
||||
knowledgeBase: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!document) {
|
||||
throw new Error('Document not found or access denied');
|
||||
}
|
||||
|
||||
// 转换BigInt为Number
|
||||
return {
|
||||
...document,
|
||||
fileSizeBytes: Number(document.fileSizeBytes),
|
||||
knowledgeBase: {
|
||||
...document.knowledgeBase,
|
||||
totalSizeBytes: Number(document.knowledgeBase.totalSizeBytes),
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除文档
|
||||
*/
|
||||
export async function deleteDocument(userId: string, documentId: string) {
|
||||
// 1. 查询文档信息
|
||||
const document = await prisma.document.findFirst({
|
||||
where: {
|
||||
id: documentId,
|
||||
userId,
|
||||
},
|
||||
include: {
|
||||
knowledgeBase: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!document) {
|
||||
throw new Error('Document not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 删除Dify中的文档
|
||||
if (document.difyDocumentId) {
|
||||
try {
|
||||
await difyClient.deleteDocument(
|
||||
document.knowledgeBase.difyDatasetId,
|
||||
document.difyDocumentId
|
||||
);
|
||||
} catch (error) {
|
||||
console.error('Failed to delete Dify document:', error);
|
||||
// 继续删除本地记录
|
||||
}
|
||||
}
|
||||
|
||||
// 3. 删除数据库记录
|
||||
await prisma.document.delete({
|
||||
where: { id: documentId },
|
||||
});
|
||||
|
||||
// 4. 更新知识库统计
|
||||
await updateKnowledgeBaseStats(document.kbId);
|
||||
}
|
||||
|
||||
/**
|
||||
* 重新处理文档
|
||||
*/
|
||||
export async function reprocessDocument(userId: string, documentId: string) {
|
||||
// 1. 查询文档信息
|
||||
const document = await prisma.document.findFirst({
|
||||
where: {
|
||||
id: documentId,
|
||||
userId,
|
||||
},
|
||||
include: {
|
||||
knowledgeBase: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!document) {
|
||||
throw new Error('Document not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 触发Dify重新索引
|
||||
if (document.difyDocumentId) {
|
||||
try {
|
||||
await difyClient.updateDocument(
|
||||
document.knowledgeBase.difyDatasetId,
|
||||
document.difyDocumentId
|
||||
);
|
||||
|
||||
// 3. 更新状态为processing
|
||||
await prisma.document.update({
|
||||
where: { id: documentId },
|
||||
data: {
|
||||
status: 'parsing',
|
||||
progress: 0,
|
||||
errorMessage: null,
|
||||
},
|
||||
});
|
||||
|
||||
// 4. 启动轮询
|
||||
pollDocumentStatus(
|
||||
userId,
|
||||
document.kbId,
|
||||
documentId,
|
||||
document.difyDocumentId
|
||||
).catch(error => {
|
||||
console.error('Failed to poll document status:', error);
|
||||
});
|
||||
} catch (error) {
|
||||
throw new Error('Failed to reprocess document');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新知识库统计信息
|
||||
*/
|
||||
async function updateKnowledgeBaseStats(kbId: string) {
|
||||
const documents = await prisma.document.findMany({
|
||||
where: { kbId },
|
||||
});
|
||||
|
||||
const totalSizeBytes = documents.reduce((sum, d) => sum + Number(d.fileSizeBytes), 0);
|
||||
const fileCount = documents.length;
|
||||
|
||||
await prisma.knowledgeBase.update({
|
||||
where: { id: kbId },
|
||||
data: {
|
||||
fileCount,
|
||||
totalSizeBytes: BigInt(totalSizeBytes),
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
364
backend/src/modules/pkb/services/knowledgeBaseService.ts
Normal file
364
backend/src/modules/pkb/services/knowledgeBaseService.ts
Normal file
@@ -0,0 +1,364 @@
|
||||
import { prisma } from '../../../config/database.js';
|
||||
import { difyClient } from '../../../common/rag/DifyClient.js';
|
||||
import { calculateDocumentTokens, selectDocumentsForFullText, TOKEN_LIMITS } from './tokenService.js';
|
||||
|
||||
/**
|
||||
* 知识库服务
|
||||
*/
|
||||
|
||||
/**
|
||||
* 创建知识库
|
||||
*/
|
||||
export async function createKnowledgeBase(
|
||||
userId: string,
|
||||
name: string,
|
||||
description?: string
|
||||
) {
|
||||
// 1. 检查用户知识库配额
|
||||
const user = await prisma.user.findUnique({
|
||||
where: { id: userId },
|
||||
select: { kbQuota: true, kbUsed: true }
|
||||
});
|
||||
|
||||
if (!user) {
|
||||
throw new Error('User not found');
|
||||
}
|
||||
|
||||
if (user.kbUsed >= user.kbQuota) {
|
||||
throw new Error(`Knowledge base quota exceeded. Maximum: ${user.kbQuota}`);
|
||||
}
|
||||
|
||||
// 2. 在Dify中创建Dataset
|
||||
const difyDataset = await difyClient.createDataset({
|
||||
name: `${userId}_${name}_${Date.now()}`,
|
||||
description: description || `Knowledge base for user ${userId}`,
|
||||
indexing_technique: 'high_quality',
|
||||
});
|
||||
|
||||
// 3. 在数据库中创建记录
|
||||
const knowledgeBase = await prisma.knowledgeBase.create({
|
||||
data: {
|
||||
userId,
|
||||
name,
|
||||
description,
|
||||
difyDatasetId: difyDataset.id,
|
||||
},
|
||||
});
|
||||
|
||||
// 4. 更新用户的知识库使用计数
|
||||
await prisma.user.update({
|
||||
where: { id: userId },
|
||||
data: {
|
||||
kbUsed: { increment: 1 },
|
||||
},
|
||||
});
|
||||
|
||||
// 5. 转换BigInt为Number
|
||||
return {
|
||||
...knowledgeBase,
|
||||
totalSizeBytes: Number(knowledgeBase.totalSizeBytes),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取用户的知识库列表
|
||||
*/
|
||||
export async function getKnowledgeBases(userId: string) {
|
||||
const knowledgeBases = await prisma.knowledgeBase.findMany({
|
||||
where: { userId },
|
||||
orderBy: { createdAt: 'desc' },
|
||||
include: {
|
||||
_count: {
|
||||
select: { documents: true },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
// 转换BigInt为Number
|
||||
return knowledgeBases.map(kb => ({
|
||||
...kb,
|
||||
totalSizeBytes: Number(kb.totalSizeBytes),
|
||||
}));
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库详情
|
||||
*/
|
||||
export async function getKnowledgeBaseById(userId: string, kbId: string) {
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId, // 确保只能访问自己的知识库
|
||||
},
|
||||
include: {
|
||||
documents: {
|
||||
orderBy: { uploadedAt: 'desc' },
|
||||
},
|
||||
_count: {
|
||||
select: { documents: true },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 转换BigInt为Number
|
||||
const result = {
|
||||
...knowledgeBase,
|
||||
totalSizeBytes: Number(knowledgeBase.totalSizeBytes),
|
||||
documents: knowledgeBase.documents.map(doc => ({
|
||||
...doc,
|
||||
fileSizeBytes: Number(doc.fileSizeBytes),
|
||||
})),
|
||||
};
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* 更新知识库
|
||||
*/
|
||||
export async function updateKnowledgeBase(
|
||||
userId: string,
|
||||
kbId: string,
|
||||
data: { name?: string; description?: string }
|
||||
) {
|
||||
// 1. 验证权限
|
||||
const existingKb = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
});
|
||||
|
||||
if (!existingKb) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 更新数据库
|
||||
const knowledgeBase = await prisma.knowledgeBase.update({
|
||||
where: { id: kbId },
|
||||
data,
|
||||
});
|
||||
|
||||
// 3. 转换BigInt为Number
|
||||
return {
|
||||
...knowledgeBase,
|
||||
totalSizeBytes: Number(knowledgeBase.totalSizeBytes),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 删除知识库
|
||||
*/
|
||||
export async function deleteKnowledgeBase(userId: string, kbId: string) {
|
||||
// 1. 验证权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 删除Dify中的Dataset
|
||||
try {
|
||||
await difyClient.deleteDataset(knowledgeBase.difyDatasetId);
|
||||
} catch (error) {
|
||||
console.error('Failed to delete Dify dataset:', error);
|
||||
// 继续删除本地记录,即使Dify删除失败
|
||||
}
|
||||
|
||||
// 3. 删除数据库记录(会级联删除documents)
|
||||
await prisma.knowledgeBase.delete({
|
||||
where: { id: kbId },
|
||||
});
|
||||
|
||||
// 4. 更新用户的知识库使用计数
|
||||
await prisma.user.update({
|
||||
where: { id: userId },
|
||||
data: {
|
||||
kbUsed: { decrement: 1 },
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* 检索知识库
|
||||
*/
|
||||
export async function searchKnowledgeBase(
|
||||
userId: string,
|
||||
kbId: string,
|
||||
query: string,
|
||||
topK: number = 15 // Phase 1优化:默认从3增加到15
|
||||
) {
|
||||
console.log('🔍 [searchKnowledgeBase] 开始检索', { kbId, query, topK });
|
||||
|
||||
// 1. 验证权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
console.error('❌ [searchKnowledgeBase] 知识库不存在', { kbId, userId });
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
console.log('📚 [searchKnowledgeBase] 找到知识库', {
|
||||
id: knowledgeBase.id,
|
||||
name: knowledgeBase.name,
|
||||
difyDatasetId: knowledgeBase.difyDatasetId
|
||||
});
|
||||
|
||||
// 2. 调用Dify检索API
|
||||
console.log('🌐 [searchKnowledgeBase] 调用Dify检索API', {
|
||||
difyDatasetId: knowledgeBase.difyDatasetId,
|
||||
query,
|
||||
topK
|
||||
});
|
||||
|
||||
const results = await difyClient.retrieveKnowledge(
|
||||
knowledgeBase.difyDatasetId,
|
||||
query,
|
||||
{
|
||||
retrieval_model: {
|
||||
search_method: 'semantic_search',
|
||||
top_k: topK,
|
||||
},
|
||||
}
|
||||
);
|
||||
|
||||
console.log('✅ [searchKnowledgeBase] Dify返回结果', {
|
||||
recordCount: results.records?.length || 0,
|
||||
hasRecords: results.records && results.records.length > 0
|
||||
});
|
||||
|
||||
if (results.records && results.records.length > 0) {
|
||||
console.log('📄 [searchKnowledgeBase] 检索到的记录:',
|
||||
results.records.map((r: any) => ({
|
||||
score: r.score,
|
||||
contentPreview: r.segment?.content?.substring(0, 100)
|
||||
}))
|
||||
);
|
||||
} else {
|
||||
console.warn('⚠️ [searchKnowledgeBase] 没有检索到任何记录');
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库统计信息
|
||||
*/
|
||||
export async function getKnowledgeBaseStats(userId: string, kbId: string) {
|
||||
// 1. 验证权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: {
|
||||
id: kbId,
|
||||
userId,
|
||||
},
|
||||
include: {
|
||||
documents: true,
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 统计信息
|
||||
const stats = {
|
||||
totalDocuments: knowledgeBase.documents.length,
|
||||
completedDocuments: knowledgeBase.documents.filter(d => d.status === 'completed').length,
|
||||
processingDocuments: knowledgeBase.documents.filter(d =>
|
||||
['uploading', 'parsing', 'indexing'].includes(d.status)
|
||||
).length,
|
||||
errorDocuments: knowledgeBase.documents.filter(d => d.status === 'error').length,
|
||||
totalSizeBytes: knowledgeBase.totalSizeBytes,
|
||||
totalTokens: knowledgeBase.documents.reduce((sum, d) => sum + (d.tokensCount || 0), 0),
|
||||
};
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取知识库文档选择(用于全文阅读模式)
|
||||
* Phase 2新增:根据Token限制选择文档
|
||||
*/
|
||||
export async function getDocumentSelection(
|
||||
userId: string,
|
||||
kbId: string,
|
||||
maxFiles?: number,
|
||||
maxTokens?: number
|
||||
) {
|
||||
// 1. 验证权限
|
||||
const knowledgeBase = await prisma.knowledgeBase.findFirst({
|
||||
where: { id: kbId, userId },
|
||||
include: {
|
||||
documents: {
|
||||
where: {
|
||||
status: 'completed', // 只选择已完成的文档
|
||||
},
|
||||
select: {
|
||||
id: true,
|
||||
filename: true,
|
||||
extractedText: true,
|
||||
charCount: true,
|
||||
extractionMethod: true,
|
||||
tokensCount: true,
|
||||
fileSizeBytes: true,
|
||||
},
|
||||
orderBy: { uploadedAt: 'desc' },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
if (!knowledgeBase) {
|
||||
throw new Error('Knowledge base not found or access denied');
|
||||
}
|
||||
|
||||
// 2. 计算每个文档的Token数
|
||||
const documentTokens = calculateDocumentTokens(knowledgeBase.documents);
|
||||
|
||||
// 3. 选择文档(根据Token限制)
|
||||
const selection = selectDocumentsForFullText(
|
||||
documentTokens,
|
||||
maxFiles || TOKEN_LIMITS.MAX_FILES,
|
||||
maxTokens || TOKEN_LIMITS.MAX_TOTAL_TOKENS
|
||||
);
|
||||
|
||||
// 4. 返回结果
|
||||
return {
|
||||
knowledgeBaseId: kbId,
|
||||
knowledgeBaseName: knowledgeBase.name,
|
||||
limits: {
|
||||
maxFiles: maxFiles || TOKEN_LIMITS.MAX_FILES,
|
||||
maxTokens: maxTokens || TOKEN_LIMITS.MAX_TOTAL_TOKENS,
|
||||
},
|
||||
selection: {
|
||||
selectedCount: selection.totalFiles,
|
||||
selectedTokens: selection.totalTokens,
|
||||
excludedCount: selection.excludedDocuments.length,
|
||||
availableTokens: selection.availableTokens,
|
||||
reason: selection.reason,
|
||||
},
|
||||
selectedDocuments: selection.selectedDocuments.map(doc => ({
|
||||
...doc,
|
||||
// 查找原始文档信息
|
||||
...knowledgeBase.documents.find(d => d.id === doc.documentId),
|
||||
})),
|
||||
excludedDocuments: selection.excludedDocuments.map(doc => ({
|
||||
...doc,
|
||||
// 查找原始文档信息
|
||||
...knowledgeBase.documents.find(d => d.id === doc.documentId),
|
||||
})),
|
||||
};
|
||||
}
|
||||
232
backend/src/modules/pkb/services/tokenService.ts
Normal file
232
backend/src/modules/pkb/services/tokenService.ts
Normal file
@@ -0,0 +1,232 @@
|
||||
import { encoding_for_model, Tiktoken } from 'tiktoken';
|
||||
|
||||
/**
|
||||
* Token计数服务
|
||||
* 用于全文阅读模式的Token管理
|
||||
*/
|
||||
|
||||
// Token限制配置
|
||||
export const TOKEN_LIMITS = {
|
||||
MAX_FILES: 50, // 最多50个文件
|
||||
MAX_TOTAL_TOKENS: 980000, // 最多980K tokens(为Qwen-Long 1M上下文留20K余量)
|
||||
CONTEXT_RESERVE: 20000, // 预留给系统提示词和用户查询的token
|
||||
};
|
||||
|
||||
// 缓存编码器
|
||||
let encoderCache: Tiktoken | null = null;
|
||||
|
||||
/**
|
||||
* 获取编码器(使用gpt-4作为Qwen的替代)
|
||||
*/
|
||||
function getEncoder(): Tiktoken {
|
||||
if (!encoderCache) {
|
||||
// Qwen使用类似GPT-4的tokenizer
|
||||
encoderCache = encoding_for_model('gpt-4');
|
||||
}
|
||||
return encoderCache;
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算文本的Token数
|
||||
*/
|
||||
export function countTokens(text: string): number {
|
||||
if (!text || text.trim().length === 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
const encoder = getEncoder();
|
||||
const tokens = encoder.encode(text);
|
||||
return tokens.length;
|
||||
} catch (error) {
|
||||
console.error('[TokenService] Failed to count tokens:', error);
|
||||
// 降级:粗略估算(中文约1.5字符/token,英文约4字符/token)
|
||||
const chineseChars = (text.match(/[\u4e00-\u9fff]/g) || []).length;
|
||||
const totalChars = text.length;
|
||||
const englishChars = totalChars - chineseChars;
|
||||
|
||||
return Math.ceil(chineseChars / 1.5 + englishChars / 4);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量计算多个文本的Token数
|
||||
*/
|
||||
export function countTokensBatch(texts: string[]): number[] {
|
||||
return texts.map(text => countTokens(text));
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算文档Token数(基于提取的文本)
|
||||
*/
|
||||
export interface DocumentTokenInfo {
|
||||
documentId: string;
|
||||
filename: string;
|
||||
charCount: number;
|
||||
estimatedTokens: number;
|
||||
extractionMethod?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* 为文档列表计算Token数
|
||||
*/
|
||||
export function calculateDocumentTokens(
|
||||
documents: Array<{
|
||||
id: string;
|
||||
filename: string;
|
||||
extractedText?: string | null;
|
||||
charCount?: number | null;
|
||||
extractionMethod?: string | null;
|
||||
}>
|
||||
): DocumentTokenInfo[] {
|
||||
return documents.map(doc => {
|
||||
let estimatedTokens = 0;
|
||||
|
||||
if (doc.extractedText) {
|
||||
// 使用提取的文本计算精确token数
|
||||
estimatedTokens = countTokens(doc.extractedText);
|
||||
} else if (doc.charCount) {
|
||||
// 如果没有提取文本,使用字符数估算
|
||||
// 假设中英文混合,平均2.5字符/token
|
||||
estimatedTokens = Math.ceil(doc.charCount / 2.5);
|
||||
}
|
||||
|
||||
return {
|
||||
documentId: doc.id,
|
||||
filename: doc.filename,
|
||||
charCount: doc.charCount || 0,
|
||||
estimatedTokens,
|
||||
extractionMethod: doc.extractionMethod || undefined,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* 选择文档以满足Token限制
|
||||
* 策略:优先选择Token数少的文档,直到达到限制
|
||||
*/
|
||||
export interface DocumentSelectionResult {
|
||||
selectedDocuments: DocumentTokenInfo[];
|
||||
totalTokens: number;
|
||||
totalFiles: number;
|
||||
excludedDocuments: DocumentTokenInfo[];
|
||||
reason: 'all_included' | 'file_limit' | 'token_limit';
|
||||
availableTokens: number;
|
||||
}
|
||||
|
||||
export function selectDocumentsForFullText(
|
||||
documents: DocumentTokenInfo[],
|
||||
maxFiles: number = TOKEN_LIMITS.MAX_FILES,
|
||||
maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
|
||||
): DocumentSelectionResult {
|
||||
// 按Token数升序排序(优先选择小文件)
|
||||
const sortedDocs = [...documents].sort(
|
||||
(a, b) => a.estimatedTokens - b.estimatedTokens
|
||||
);
|
||||
|
||||
const selected: DocumentTokenInfo[] = [];
|
||||
const excluded: DocumentTokenInfo[] = [];
|
||||
let totalTokens = 0;
|
||||
|
||||
for (const doc of sortedDocs) {
|
||||
// 检查文件数限制
|
||||
if (selected.length >= maxFiles) {
|
||||
excluded.push(doc);
|
||||
continue;
|
||||
}
|
||||
|
||||
// 检查Token限制
|
||||
if (totalTokens + doc.estimatedTokens > maxTokens) {
|
||||
excluded.push(doc);
|
||||
continue;
|
||||
}
|
||||
|
||||
// 添加到选中列表
|
||||
selected.push(doc);
|
||||
totalTokens += doc.estimatedTokens;
|
||||
}
|
||||
|
||||
// 判断限制原因
|
||||
let reason: 'all_included' | 'file_limit' | 'token_limit' = 'all_included';
|
||||
if (excluded.length > 0) {
|
||||
if (selected.length >= maxFiles) {
|
||||
reason = 'file_limit';
|
||||
} else {
|
||||
reason = 'token_limit';
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
selectedDocuments: selected,
|
||||
totalTokens,
|
||||
totalFiles: selected.length,
|
||||
excludedDocuments: excluded,
|
||||
reason,
|
||||
availableTokens: maxTokens - totalTokens,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 估算查询需要的Token数
|
||||
*/
|
||||
export function estimateQueryTokens(query: string, systemPrompt?: string): number {
|
||||
let total = countTokens(query);
|
||||
|
||||
if (systemPrompt) {
|
||||
total += countTokens(systemPrompt);
|
||||
}
|
||||
|
||||
// 为响应预留空间
|
||||
total += 2000; // 假设响应最多2000 tokens
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查是否超过Token限制
|
||||
*/
|
||||
export function checkTokenLimit(
|
||||
documentsTokens: number,
|
||||
queryTokens: number,
|
||||
maxTokens: number = TOKEN_LIMITS.MAX_TOTAL_TOKENS
|
||||
): {
|
||||
withinLimit: boolean;
|
||||
totalTokens: number;
|
||||
maxTokens: number;
|
||||
remaining: number;
|
||||
} {
|
||||
const totalTokens = documentsTokens + queryTokens;
|
||||
const remaining = maxTokens - totalTokens;
|
||||
|
||||
return {
|
||||
withinLimit: remaining >= 0,
|
||||
totalTokens,
|
||||
maxTokens,
|
||||
remaining,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* 释放编码器(清理资源)
|
||||
*/
|
||||
export function cleanup() {
|
||||
if (encoderCache) {
|
||||
encoderCache.free();
|
||||
encoderCache = null;
|
||||
}
|
||||
}
|
||||
|
||||
// 进程退出时清理
|
||||
if (typeof process !== 'undefined') {
|
||||
process.on('exit', cleanup);
|
||||
process.on('SIGINT', () => {
|
||||
cleanup();
|
||||
process.exit();
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
152
backend/src/modules/pkb/templates/clinicalResearch.ts
Normal file
152
backend/src/modules/pkb/templates/clinicalResearch.ts
Normal file
@@ -0,0 +1,152 @@
|
||||
/**
|
||||
* Phase 3: 批处理模式 - 临床研究信息提取模板
|
||||
*
|
||||
* 提取临床研究的8个核心字段:
|
||||
* 1. 研究目的
|
||||
* 2. 研究设计
|
||||
* 3. 研究对象
|
||||
* 4. 样本量(text类型,保留原文描述)
|
||||
* 5. 干预组
|
||||
* 6. 对照组
|
||||
* 7. 结果及数据
|
||||
* 8. 牛津评级(提供详细标准)
|
||||
*/
|
||||
|
||||
export interface TemplateField {
|
||||
key: string;
|
||||
label: string;
|
||||
type: 'text' | 'longtext' | 'number';
|
||||
description?: string;
|
||||
}
|
||||
|
||||
export interface BatchTemplate {
|
||||
id: string;
|
||||
name: string;
|
||||
description: string;
|
||||
outputFields: TemplateField[];
|
||||
systemPrompt: string;
|
||||
userPrompt: string;
|
||||
}
|
||||
|
||||
export const CLINICAL_RESEARCH_TEMPLATE: BatchTemplate = {
|
||||
id: 'clinical_research',
|
||||
name: '临床研究信息提取',
|
||||
description: '提取研究目的、设计、对象、样本量、干预、对照、结果、证据等级',
|
||||
|
||||
outputFields: [
|
||||
{
|
||||
key: 'research_purpose',
|
||||
label: '研究目的',
|
||||
type: 'text',
|
||||
description: '研究想要解决的问题或验证的假设'
|
||||
},
|
||||
{
|
||||
key: 'research_design',
|
||||
label: '研究设计',
|
||||
type: 'text',
|
||||
description: '研究类型(RCT、队列研究等)'
|
||||
},
|
||||
{
|
||||
key: 'research_subjects',
|
||||
label: '研究对象',
|
||||
type: 'text',
|
||||
description: '纳入/排除标准、人群特征'
|
||||
},
|
||||
{
|
||||
key: 'sample_size',
|
||||
label: '样本量',
|
||||
type: 'text', // ✅ text类型,保留原文描述
|
||||
description: '实际纳入的受试者人数'
|
||||
},
|
||||
{
|
||||
key: 'intervention_group',
|
||||
label: '干预组',
|
||||
type: 'text',
|
||||
description: '实验组的干预措施'
|
||||
},
|
||||
{
|
||||
key: 'control_group',
|
||||
label: '对照组',
|
||||
type: 'text',
|
||||
description: '对照组的情况'
|
||||
},
|
||||
{
|
||||
key: 'results_data',
|
||||
label: '结果及数据',
|
||||
type: 'longtext',
|
||||
description: '主要结局指标的具体数据'
|
||||
},
|
||||
{
|
||||
key: 'oxford_level',
|
||||
label: '牛津评级',
|
||||
type: 'text',
|
||||
description: '证据等级(1a-5)'
|
||||
},
|
||||
],
|
||||
|
||||
systemPrompt: `你是一个专业的临床研究数据提取助手。
|
||||
你的任务是从临床研究文献中提取结构化信息。
|
||||
你的回答必须严格遵循JSON格式,不要有任何额外的文字说明。`,
|
||||
|
||||
userPrompt: `请仔细阅读这篇临床研究文献,提取以下信息:
|
||||
|
||||
1. **研究目的**:本研究想要解决什么问题或验证什么假设?用1-2句话概括。
|
||||
|
||||
2. **研究设计**:研究类型,如随机对照试验(RCT)、队列研究、病例对照研究、横断面研究、系统评价/Meta分析等。
|
||||
|
||||
3. **研究对象**:描述纳入标准、排除标准、人群特征(年龄、性别、疾病状态等)。
|
||||
|
||||
4. **样本量**:实际纳入的受试者人数,保留原文描述(如"干预组156人,对照组152人,共308人")。
|
||||
|
||||
5. **干预组**:实验组接受的治疗或干预措施,包括药物名称、剂量、给药方式、疗程等。
|
||||
|
||||
6. **对照组**:对照组的情况,如安慰剂、标准治疗、空白对照等。
|
||||
|
||||
7. **结果及数据**:主要结局指标的具体数据、统计结果、P值、置信区间等。包括基线数据对比和终点数据对比。
|
||||
|
||||
8. **牛津评级**:根据研究设计判断证据等级,参考以下标准:
|
||||
- **1a**:系统评价/Meta分析(多个RCT的汇总分析)
|
||||
- **1b**:单个随机对照试验(RCT)
|
||||
- **2a**:设计良好的对照研究(无随机化)
|
||||
- **2b**:设计良好的准实验研究(队列研究、病例对照研究)
|
||||
- **3a**:描述性研究(横断面研究、病例系列)
|
||||
- **3b**:个案报告(单一病例)
|
||||
- **4**:专家意见、共识声明
|
||||
- **5**:基础研究(动物实验、体外研究)
|
||||
|
||||
请严格按照以下JSON格式输出,不要有任何额外说明或前言:
|
||||
{
|
||||
"research_purpose": "...",
|
||||
"research_design": "...",
|
||||
"research_subjects": "...",
|
||||
"sample_size": "...",
|
||||
"intervention_group": "...",
|
||||
"control_group": "...",
|
||||
"results_data": "...",
|
||||
"oxford_level": "..."
|
||||
}`,
|
||||
};
|
||||
|
||||
// 导出所有预设模板
|
||||
export const PRESET_TEMPLATES: Record<string, BatchTemplate> = {
|
||||
[CLINICAL_RESEARCH_TEMPLATE.id]: CLINICAL_RESEARCH_TEMPLATE,
|
||||
};
|
||||
|
||||
// 获取模板
|
||||
export function getTemplate(templateId: string): BatchTemplate | null {
|
||||
return PRESET_TEMPLATES[templateId] || null;
|
||||
}
|
||||
|
||||
// 获取所有模板列表
|
||||
export function getAllTemplates(): BatchTemplate[] {
|
||||
return Object.values(PRESET_TEMPLATES);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user