Files
AIclinicalresearch/backend/src/modules/dc/tool-b/controllers/ExtractionController.ts
HaHafeng 8a17369138 feat(dc): Complete Tool B MVP with full API integration and bug fixes
Phase 5: Export Feature
- Add Excel export API endpoint (GET /tasks/:id/export)
- Fix Content-Disposition header encoding for Chinese filenames
- Fix export field order to match template definition
- Export finalResult or resultA as fallback

API Integration Fixes (Phase 1-5):
- Fix API response parsing (return result.data consistently)
- Fix field name mismatch (fileKey -> sourceFileKey)
- Fix Excel parsing bug (range:99 -> slice(0,100))
- Add file upload with Excel parsing (columns, totalRows)
- Add detailed error logging for debugging

LLM Integration Fixes:
- Fix LLM call method: LLMFactory.createLLM -> getAdapter
- Fix adapter interface: generateText -> chat([messages])
- Fix response fields: text -> content, tokensUsed -> usage.totalTokens
- Fix model names: qwen-max -> qwen3-72b

React Infinite Loop Fixes:
- Step2: Remove updateState from useEffect deps
- Step3: Add useRef to prevent Strict Mode double execution
- Step3: Clear interval on API failure (max 3 retries)
- Step4: Add useRef to prevent infinite data loading
- Add cleanup functions to all useEffect hooks

Frontend Enhancements:
- Add comprehensive error handling with user-friendly messages
- Remove debug console.logs (production ready)
- Fix TypeScript type definitions (TaskProgress, ExtractionItem)
- Improve Step4Verify data transformation logic

Backend Enhancements:
- Add detailed logging at each step for debugging
- Add parameter validation in controllers
- Improve error messages with stack traces (dev mode)
- Add export field ordering by template definition

Documentation Updates:
- Update module status: Tool B MVP completed
- Create MVP completion summary (06-开发记录)
- Create technical debt document (07-技术债务)
- Update API documentation with test status
- Update database documentation with verified status
- Update system overview with DC module status
- Document 4 known issues (Excel preprocessing, progress display, etc.)

Testing Results:
- File upload: 9 rows parsed successfully
- Health check: Column validation working
- Dual model extraction: DeepSeek-V3 + Qwen-Max both working
- Processing time: ~49s for 9 records (~5s per record)
- Token usage: ~10k tokens total (~1.1k per record)
- Conflict detection: 1 clean, 8 conflicts (88.9% conflict rate)
- Excel export: Working with proper encoding

Files Changed:
Backend (~500 lines):
- ExtractionController.ts: Add upload endpoint, improve logging
- DualModelExtractionService.ts: Fix LLM call methods, add detailed logs
- HealthCheckService.ts: Fix Excel range parsing
- routes/index.ts: Add upload route

Frontend (~200 lines):
- toolB.ts: Fix API response parsing, add error handling
- Step1Upload.tsx: Integrate upload and health check APIs
- Step2Schema.tsx: Fix infinite loop, load templates from API
- Step3Processing.tsx: Fix infinite loop, integrate progress polling
- Step4Verify.tsx: Fix infinite loop, transform backend data correctly
- Step5Result.tsx: Integrate export API
- index.tsx: Add file metadata to state

Scripts:
- check-task-progress.mjs: Database inspection utility

Docs (~8 files):
- 00-模块当前状态与开发指南.md: Update to v2.0
- API设计文档.md: Mark all endpoints as tested
- 数据库设计文档.md: Update verification status
- DC模块Tool-B开发计划.md: Add MVP completion notice
- DC模块Tool-B开发任务清单.md: Update progress to 100%
- Tool-B-MVP完成总结.md: New completion summary
- Tool-B技术债务清单.md: New technical debt document
- 00-系统当前状态与开发指南.md: Update DC module status

Status: Tool B MVP complete and production ready
2025-12-03 15:07:39 +08:00

592 lines
17 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* DC模块 - 提取控制器
*
* API端点
* - POST /api/v1/dc/tool-b/health-check - 健康检查
* - GET /api/v1/dc/tool-b/templates - 获取模板列表
* - POST /api/v1/dc/tool-b/tasks - 创建提取任务
* - GET /api/v1/dc/tool-b/tasks/:taskId/progress - 查询任务进度
* - GET /api/v1/dc/tool-b/tasks/:taskId/items - 获取验证网格数据
* - POST /api/v1/dc/tool-b/items/:itemId/resolve - 裁决冲突
*
* 平台能力复用:
* - ✅ logger: 日志记录
* - ✅ prisma: 数据库操作
* - ✅ storage: 文件操作
* - ✅ jobQueue: 异步任务
*/
import { FastifyRequest, FastifyReply } from 'fastify';
import { healthCheckService } from '../services/HealthCheckService.js';
import { templateService } from '../services/TemplateService.js';
import { dualModelExtractionService } from '../services/DualModelExtractionService.js';
import { conflictDetectionService } from '../services/ConflictDetectionService.js';
import { storage } from '../../../../common/storage/index.js';
import { logger } from '../../../../common/logging/index.js';
import { prisma } from '../../../../config/database.js';
import * as xlsx from 'xlsx';
export class ExtractionController {
/**
* 文件上传
* POST /upload
*/
async uploadFile(request: FastifyRequest, reply: FastifyReply) {
try {
const data = await request.file();
if (!data) {
return reply.code(400).send({
success: false,
error: 'No file uploaded'
});
}
const userId = (request as any).userId || 'default-user';
const buffer = await data.toBuffer();
const originalFilename = data.filename;
const timestamp = Date.now();
const fileKey = `dc/tool-b/${userId}/${timestamp}_${originalFilename}`;
logger.info('[API] File upload request', {
filename: originalFilename,
size: buffer.length,
userId
});
// 解析Excel文件获取列名和行数
const workbook = xlsx.read(buffer, { type: 'buffer' });
const sheetName = workbook.SheetNames[0];
const worksheet = workbook.Sheets[sheetName];
const jsonData = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet);
// 获取列名从第一行数据的keys
const columns = jsonData.length > 0 ? Object.keys(jsonData[0]) : [];
const totalRows = jsonData.length;
logger.info('[API] Excel parsed', { columns, totalRows });
// 上传到storage
const url = await storage.upload(fileKey, buffer);
logger.info('[API] File uploaded successfully', { fileKey, url });
return reply.code(200).send({
success: true,
data: {
fileKey,
url,
filename: originalFilename,
size: buffer.length,
totalRows,
columns
}
});
} catch (error) {
logger.error('[API] File upload failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 健康检查
* POST /health-check
*/
async healthCheck(request: FastifyRequest<{
Body: {
fileKey: string;
columnName: string;
}
}>, reply: FastifyReply) {
try {
const { fileKey, columnName } = request.body;
const userId = (request as any).userId || 'default-user'; // TODO: 从auth middleware获取
logger.info('[API] Health check request', { fileKey, columnName, userId });
// 参数验证
if (!fileKey || !columnName) {
logger.error('[API] Missing required parameters', { fileKey, columnName });
return reply.code(400).send({
success: false,
error: 'Missing required parameters: fileKey or columnName'
});
}
const result = await healthCheckService.check(fileKey, columnName, userId);
logger.info('[API] Health check success', { status: result.status });
return reply.code(200).send({
success: true,
data: result
});
} catch (error: any) {
logger.error('[API] Health check failed', {
error: error.message,
stack: error.stack,
fileKey: request.body?.fileKey,
columnName: request.body?.columnName
});
return reply.code(500).send({
success: false,
error: error.message || String(error),
details: process.env.NODE_ENV === 'development' ? error.stack : undefined
});
}
}
/**
* 获取模板列表
* GET /templates
*/
async getTemplates(request: FastifyRequest, reply: FastifyReply) {
try {
logger.info('[API] Get templates request');
const templates = await templateService.getAllTemplates();
return reply.code(200).send({
success: true,
data: { templates }
});
} catch (error) {
logger.error('[API] Get templates failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 创建提取任务
* POST /tasks
*/
async createTask(request: FastifyRequest<{
Body: {
projectName: string;
sourceFileKey: string;
textColumn: string;
diseaseType: string;
reportType: string;
modelA?: string;
modelB?: string;
}
}>, reply: FastifyReply) {
try {
logger.info('[API] ===== CREATE TASK START =====');
const {
projectName,
sourceFileKey,
textColumn,
diseaseType,
reportType,
modelA = 'deepseek-v3',
modelB = 'qwen-max'
} = request.body;
const userId = (request as any).userId || 'default-user';
logger.info('[API] Create task request', {
userId,
projectName,
sourceFileKey,
textColumn,
diseaseType,
reportType
});
// 1. 获取模板
logger.info('[API] Step 1: Getting template', { diseaseType, reportType });
const template = await templateService.getTemplate(diseaseType, reportType);
if (!template) {
logger.error('[API] Template not found', { diseaseType, reportType });
return reply.code(404).send({
success: false,
error: `Template not found: ${diseaseType}/${reportType}`
});
}
logger.info('[API] Template found', { templateId: template.id });
// 2. 读取Excel文件创建items
logger.info('[API] Step 2: Downloading Excel file', { sourceFileKey });
const fileBuffer = await storage.download(sourceFileKey);
if (!fileBuffer) {
logger.error('[API] File not found in storage', { sourceFileKey });
return reply.code(404).send({
success: false,
error: `File not found: ${sourceFileKey}`
});
}
logger.info('[API] File downloaded', { size: fileBuffer.length });
logger.info('[API] Step 3: Parsing Excel file');
const workbook = xlsx.read(fileBuffer, { type: 'buffer' });
const sheetName = workbook.SheetNames[0];
const worksheet = workbook.Sheets[sheetName];
const data = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet);
logger.info('[API] Excel parsed', { rowCount: data.length });
if (!data[0].hasOwnProperty(textColumn)) {
logger.error('[API] Column not found', {
textColumn,
availableColumns: Object.keys(data[0])
});
return reply.code(400).send({
success: false,
error: `Column '${textColumn}' not found in Excel`
});
}
// 3. 创建任务
logger.info('[API] Step 4: Creating task in database');
const task = await prisma.dCExtractionTask.create({
data: {
userId,
projectName,
sourceFileKey,
textColumn,
diseaseType,
reportType,
targetFields: template.fields as any, // Prisma Json类型
modelA,
modelB,
totalCount: data.length,
status: 'pending'
}
});
logger.info('[API] Task created in database', { taskId: task.id });
// 4. 创建items
logger.info('[API] Step 5: Creating extraction items', { count: data.length });
const itemsData = data.map((row, index) => ({
taskId: task.id,
rowIndex: index + 1,
originalText: String(row[textColumn] || '')
}));
await prisma.dCExtractionItem.createMany({
data: itemsData
});
logger.info('[API] Items created', { count: itemsData.length });
// 5. 启动异步任务
// TODO: 使用jobQueue.add()
// 暂时直接调用
logger.info('[API] Starting batch extraction (async)', { taskId: task.id });
dualModelExtractionService.batchExtract(task.id)
.then(() => {
logger.info('[API] Batch extraction completed successfully', { taskId: task.id });
})
.catch(err => {
logger.error('[API] Batch extraction failed', {
error: err.message,
stack: err.stack,
taskId: task.id
});
});
logger.info('[API] Task created', { taskId: task.id, itemCount: data.length });
return reply.code(201).send({
success: true,
data: {
taskId: task.id,
totalCount: data.length,
status: 'pending'
}
});
} catch (error) {
logger.error('[API] Create task failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 查询任务进度
* GET /tasks/:taskId/progress
*/
async getTaskProgress(request: FastifyRequest<{
Params: { taskId: string }
}>, reply: FastifyReply) {
try {
const { taskId } = request.params;
logger.info('[API] Get task progress', { taskId });
const task = await prisma.dCExtractionTask.findUnique({
where: { id: taskId }
});
if (!task) {
return reply.code(404).send({
success: false,
error: 'Task not found'
});
}
return reply.code(200).send({
success: true,
data: {
taskId: task.id,
status: task.status,
totalCount: task.totalCount,
processedCount: task.processedCount,
cleanCount: task.cleanCount,
conflictCount: task.conflictCount,
failedCount: task.failedCount,
totalTokens: task.totalTokens,
totalCost: task.totalCost,
progress: task.totalCount > 0 ? Math.round((task.processedCount / task.totalCount) * 100) : 0
}
});
} catch (error) {
logger.error('[API] Get task progress failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 获取验证网格数据
* GET /tasks/:taskId/items
*/
async getTaskItems(request: FastifyRequest<{
Params: { taskId: string };
Querystring: { page?: string; limit?: string; status?: string }
}>, reply: FastifyReply) {
try {
const { taskId } = request.params;
const page = parseInt(request.query.page || '1');
const limit = parseInt(request.query.limit || '50');
const statusFilter = request.query.status;
logger.info('[API] Get task items', { taskId, page, limit, statusFilter });
const where: any = { taskId };
if (statusFilter) {
where.status = statusFilter;
}
const [items, total] = await Promise.all([
prisma.dCExtractionItem.findMany({
where,
skip: (page - 1) * limit,
take: limit,
orderBy: { rowIndex: 'asc' }
}),
prisma.dCExtractionItem.count({ where })
]);
return reply.code(200).send({
success: true,
data: {
items: items.map(item => ({
id: item.id,
rowIndex: item.rowIndex,
originalText: item.originalText,
resultA: item.resultA,
resultB: item.resultB,
status: item.status,
conflictFields: item.conflictFields,
finalResult: item.finalResult
})),
pagination: {
page,
limit,
total,
totalPages: Math.ceil(total / limit)
}
}
});
} catch (error) {
logger.error('[API] Get task items failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 裁决冲突
* POST /items/:itemId/resolve
*/
async resolveConflict(request: FastifyRequest<{
Params: { itemId: string };
Body: {
field: string;
chosenValue: string;
}
}>, reply: FastifyReply) {
try {
const { itemId } = request.params;
const { field, chosenValue } = request.body;
logger.info('[API] Resolve conflict', { itemId, field });
// 获取当前记录
const item = await prisma.dCExtractionItem.findUnique({
where: { id: itemId }
});
if (!item) {
return reply.code(404).send({
success: false,
error: 'Item not found'
});
}
// 更新finalResult
const finalResult = { ...(item.finalResult as Record<string, string> || {}) };
finalResult[field] = chosenValue;
// 移除已解决的冲突字段
const conflictFields = item.conflictFields.filter(f => f !== field);
// 更新状态
const newStatus = conflictFields.length === 0 ? 'resolved' : 'conflict';
await prisma.dCExtractionItem.update({
where: { id: itemId },
data: {
finalResult,
conflictFields,
status: newStatus,
resolvedAt: conflictFields.length === 0 ? new Date() : null
}
});
logger.info('[API] Conflict resolved', { itemId, field, newStatus });
return reply.code(200).send({
success: true,
data: {
itemId,
status: newStatus,
remainingConflicts: conflictFields.length
}
});
} catch (error) {
logger.error('[API] Resolve conflict failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
/**
* 导出结果
* GET /tasks/:taskId/export
*/
async exportResults(request: FastifyRequest<{
Params: { taskId: string };
}>, reply: FastifyReply) {
try {
const { taskId } = request.params;
logger.info('[API] Export results request', { taskId });
// 获取任务和所有items
const task = await prisma.dCExtractionTask.findUnique({
where: { id: taskId },
include: {
items: {
orderBy: { rowIndex: 'asc' }
}
}
});
if (!task) {
return reply.code(404).send({
success: false,
error: 'Task not found'
});
}
// 创建Excel工作簿
const workbook = xlsx.utils.book_new();
// 🔑 获取字段顺序从targetFields
const targetFields = task.targetFields as { name: string; desc: string }[];
const fieldNames = targetFields.map(f => f.name);
// 构建数据行,按模板字段顺序
const rows = task.items.map(item => {
// 优先使用finalResult如果为空则使用resultA
const finalResult = item.finalResult as Record<string, string> | null;
const resultA = item.resultA as Record<string, string> | null;
const extractedData = finalResult || resultA || {};
// 🔑 按字段顺序构建行对象
const row: Record<string, any> = {
'行号': item.rowIndex,
'原文': item.originalText,
'状态': item.status === 'resolved' ? '已解决' : item.status === 'clean' ? '一致' : '待裁决'
};
// 按模板定义的顺序添加字段
fieldNames.forEach(fieldName => {
row[fieldName] = extractedData[fieldName] || '未提及';
});
return row;
});
// 创建工作表
const worksheet = xlsx.utils.json_to_sheet(rows);
xlsx.utils.book_append_sheet(workbook, worksheet, '提取结果');
// 生成Excel Buffer
const excelBuffer = xlsx.write(workbook, { type: 'buffer', bookType: 'xlsx' });
logger.info('[API] Export results success', { taskId, rowCount: rows.length });
// 返回文件
// 🔑 对文件名进行URL编码以支持中文
const filename = `${task.projectName}_结果.xlsx`;
const encodedFilename = encodeURIComponent(filename);
return reply
.code(200)
.header('Content-Type', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
.header('Content-Disposition', `attachment; filename*=UTF-8''${encodedFilename}`)
.send(excelBuffer);
} catch (error) {
logger.error('[API] Export results failed', { error });
return reply.code(500).send({
success: false,
error: String(error)
});
}
}
}
// 导出单例
export const extractionController = new ExtractionController();