Files
AIclinicalresearch/backend/src/modules/dc/tool-b/services/DualModelExtractionService.ts
HaHafeng 8a17369138 feat(dc): Complete Tool B MVP with full API integration and bug fixes
Phase 5: Export Feature
- Add Excel export API endpoint (GET /tasks/:id/export)
- Fix Content-Disposition header encoding for Chinese filenames
- Fix export field order to match template definition
- Export finalResult or resultA as fallback

API Integration Fixes (Phase 1-5):
- Fix API response parsing (return result.data consistently)
- Fix field name mismatch (fileKey -> sourceFileKey)
- Fix Excel parsing bug (range:99 -> slice(0,100))
- Add file upload with Excel parsing (columns, totalRows)
- Add detailed error logging for debugging

LLM Integration Fixes:
- Fix LLM call method: LLMFactory.createLLM -> getAdapter
- Fix adapter interface: generateText -> chat([messages])
- Fix response fields: text -> content, tokensUsed -> usage.totalTokens
- Fix model names: qwen-max -> qwen3-72b

React Infinite Loop Fixes:
- Step2: Remove updateState from useEffect deps
- Step3: Add useRef to prevent Strict Mode double execution
- Step3: Clear interval on API failure (max 3 retries)
- Step4: Add useRef to prevent infinite data loading
- Add cleanup functions to all useEffect hooks

Frontend Enhancements:
- Add comprehensive error handling with user-friendly messages
- Remove debug console.logs (production ready)
- Fix TypeScript type definitions (TaskProgress, ExtractionItem)
- Improve Step4Verify data transformation logic

Backend Enhancements:
- Add detailed logging at each step for debugging
- Add parameter validation in controllers
- Improve error messages with stack traces (dev mode)
- Add export field ordering by template definition

Documentation Updates:
- Update module status: Tool B MVP completed
- Create MVP completion summary (06-开发记录)
- Create technical debt document (07-技术债务)
- Update API documentation with test status
- Update database documentation with verified status
- Update system overview with DC module status
- Document 4 known issues (Excel preprocessing, progress display, etc.)

Testing Results:
- File upload: 9 rows parsed successfully
- Health check: Column validation working
- Dual model extraction: DeepSeek-V3 + Qwen-Max both working
- Processing time: ~49s for 9 records (~5s per record)
- Token usage: ~10k tokens total (~1.1k per record)
- Conflict detection: 1 clean, 8 conflicts (88.9% conflict rate)
- Excel export: Working with proper encoding

Files Changed:
Backend (~500 lines):
- ExtractionController.ts: Add upload endpoint, improve logging
- DualModelExtractionService.ts: Fix LLM call methods, add detailed logs
- HealthCheckService.ts: Fix Excel range parsing
- routes/index.ts: Add upload route

Frontend (~200 lines):
- toolB.ts: Fix API response parsing, add error handling
- Step1Upload.tsx: Integrate upload and health check APIs
- Step2Schema.tsx: Fix infinite loop, load templates from API
- Step3Processing.tsx: Fix infinite loop, integrate progress polling
- Step4Verify.tsx: Fix infinite loop, transform backend data correctly
- Step5Result.tsx: Integrate export API
- index.tsx: Add file metadata to state

Scripts:
- check-task-progress.mjs: Database inspection utility

Docs (~8 files):
- 00-模块当前状态与开发指南.md: Update to v2.0
- API设计文档.md: Mark all endpoints as tested
- 数据库设计文档.md: Update verification status
- DC模块Tool-B开发计划.md: Add MVP completion notice
- DC模块Tool-B开发任务清单.md: Update progress to 100%
- Tool-B-MVP完成总结.md: New completion summary
- Tool-B技术债务清单.md: New technical debt document
- 00-系统当前状态与开发指南.md: Update DC module status

Status: Tool B MVP complete and production ready
2025-12-03 15:07:39 +08:00

426 lines
12 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
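/**
 * DC module - dual-model extraction service
 *
 * Features:
 * - Concurrently calls DeepSeek-V3 and Qwen (qwen3-72b) for text extraction
 * - PII masking
 * - JSON parsing with fallbacks
 * - Token accounting
 * - Asynchronous task management
 *
 * Platform capabilities reused:
 * - ✅ LLMFactory: LLM calls
 * - ✅ jobQueue: asynchronous tasks
 * - ✅ logger: logging
 * - ✅ prisma: database access
 */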
import { LLMFactory } from '../../../../common/llm/adapters/LLMFactory.js';
import { logger } from '../../../../common/logging/index.js';
import { prisma } from '../../../../config/database.js';
/**
 * Input for a single dual-model extraction.
 */
export interface ExtractionInput {
  /** Source text of the record; it is PII-masked before being placed in the prompt. */
  text: string;
  /** Fields the model output must contain (name plus human-readable description). */
  fields: Array<{ name: string; desc: string }>;
  /** Prompt template; the masked record text is appended after it. */
  promptTemplate: string;
}
/**
 * Result produced by one model for one record.
 */
export interface ExtractionOutput {
  /** Extracted values keyed by field name. */
  result: { [fieldName: string]: string };
  /** Total token count reported by the model, or 0 when none was reported. */
  tokensUsed: number;
  /** Unparsed model output, kept for inspection. */
  rawOutput: any;
}
export class DualModelExtractionService {
  /**
   * Extracts the requested fields from one record with two models in parallel.
   *
   * The record text is PII-masked, combined with the prompt template, and sent
   * to both models concurrently. The call only succeeds when BOTH models
   * succeed.
   *
   * @param input Record text, field definitions and prompt template
   * @param taskId Task ID (used for logging only)
   * @param itemId Record ID (used for logging only)
   * @returns The output of both models
   * @throws Error('Dual model extraction failed') when either model call rejects
   */
  async extract(input: ExtractionInput, taskId: string, itemId: string): Promise<{
    resultA: ExtractionOutput;
    resultB: ExtractionOutput;
  }> {
    try {
      logger.info('[DualExtraction] Starting extraction', { taskId, itemId });

      // 1. Mask PII before anything leaves the process
      const maskedText = this.maskPII(input.text);

      // 2. Build the prompt
      const prompt = this.buildPrompt(maskedText, input.fields, input.promptTemplate);

      // 3. Call both models concurrently; allSettled lets us log both outcomes
      const [resultA, resultB] = await Promise.allSettled([
        this.callModel('deepseek', prompt, input.fields),
        this.callModel('qwen', prompt, input.fields)
      ]);

      // 4. Both models must succeed
      if (resultA.status === 'rejected' || resultB.status === 'rejected') {
        logger.error('[DualExtraction] One or both models failed', {
          taskId,
          itemId,
          errorA: resultA.status === 'rejected' ? resultA.reason : null,
          errorB: resultB.status === 'rejected' ? resultB.reason : null
        });
        throw new Error('Dual model extraction failed');
      }

      logger.info('[DualExtraction] Extraction completed', {
        taskId,
        itemId,
        tokensA: resultA.value.tokensUsed,
        tokensB: resultB.value.tokensUsed
      });

      return {
        resultA: resultA.value,
        resultB: resultB.value
      };
    } catch (error) {
      logger.error('[DualExtraction] Extraction failed', { error, taskId, itemId });
      throw error;
    }
  }

  /**
   * Masks personally identifiable information with regular expressions.
   *
   * - ID card number: first 6 and last 4 characters kept (330102********1234)
   * - Mobile number: first 3 and last 4 digits kept (138****5678)
   * - Name following "患者" or "姓名:": first character kept, rest starred
   *
   * @param text Raw record text
   * @returns Text with the patterns above masked
   */
  private maskPII(text: string): string {
    let masked = text;

    // ID card numbers must be masked BEFORE phone numbers: an 18-digit ID
    // usually contains an 11-digit run that looks like a mobile number
    // (e.g. "19900101123"), and masking that run first would both leave most
    // of the ID readable and stop the ID pattern from matching afterwards.
    masked = masked.replace(
      /\d{6}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dxX]/g,
      (match) => match.substring(0, 6) + '********' + match.substring(14)
    );

    // Mobile numbers: 138****5678
    masked = masked.replace(
      /1[3-9]\d{9}/g,
      (match) => match.substring(0, 3) + '****' + match.substring(7)
    );

    // Simple name masking after "患者" or "姓名" + colon. Both the ASCII colon
    // and the full-width colon (U+FF1A) are accepted, and the name stops at
    // whitespace, "。", or an ASCII / full-width (U+FF0C) comma.
    masked = masked.replace(
      /(患者|姓名[:\uFF1A])\s*([^\s。,\uFF0C]{2,4})/g,
      (_match: string, prefix: string, name: string) =>
        prefix + name[0] + '*'.repeat(name.length - 1)
    );

    return masked;
  }

  /**
   * Builds the prompt by appending the (masked) record text to the template.
   *
   * @param text Masked record text
   * @param fields Field definitions (currently unused; the template is expected to describe them)
   * @param template Prompt template
   */
  private buildPrompt(text: string, fields: { name: string; desc: string }[], template: string): string {
    // The template literal below is emitted verbatim, so its continuation
    // lines intentionally start at column 0.
    return `${template}
**病历原文:**
${text}
请严格按照JSON格式输出不要有任何额外文字。`;
  }

  /**
   * Calls a single model and parses its answer.
   *
   * @param modelType Which model to call
   * @param prompt Full prompt text
   * @param fields Field definitions used to validate the parsed JSON
   * @returns Parsed result, token usage and the raw response content
   * @throws Re-throws whatever the adapter lookup or chat call throws
   */
  private async callModel(
    modelType: 'deepseek' | 'qwen',
    prompt: string,
    fields: { name: string; desc: string }[]
  ): Promise<ExtractionOutput> {
    const tag = `[${modelType.toUpperCase()}]`;
    try {
      // Resolve the adapter through LLMFactory
      const modelName = modelType === 'deepseek' ? 'deepseek-v3' : 'qwen3-72b';
      logger.info(`${tag} Getting adapter`, { modelName });
      const adapter = LLMFactory.getAdapter(modelName as any);
      logger.info(`${tag} Adapter created successfully`);

      logger.info(`${tag} Calling model with prompt`, {
        modelName,
        promptLength: prompt.length,
        promptPreview: prompt.substring(0, 100) + '...'
      });

      // Single-turn chat call
      const startTime = Date.now();
      const response = await adapter.chat([
        { role: 'user', content: prompt }
      ], {
        temperature: 0, // maximum determinism
        maxTokens: 1000
      });
      const elapsedTime = Date.now() - startTime;

      logger.info(`${tag} Model responded successfully`, {
        modelName,
        tokensUsed: response.usage?.totalTokens,
        elapsedMs: elapsedTime,
        contentLength: response.content.length,
        contentPreview: response.content.substring(0, 200)
      });

      // Parse the JSON answer (three fallback strategies)
      logger.info(`${tag} Parsing JSON response`);
      const result = this.parseJSON(response.content, fields);
      logger.info(`${tag} JSON parsed successfully`, {
        fieldCount: Object.keys(result).length
      });

      return {
        result,
        // `??` rather than `||`: only a missing count falls back to 0
        tokensUsed: response.usage?.totalTokens ?? 0,
        rawOutput: response.content
      };
    } catch (error: unknown) {
      const details = error instanceof Error ? error : new Error(String(error));
      logger.error(`${tag} Model call failed`, {
        error: details.message,
        stack: details.stack,
        modelType
      });
      throw error;
    }
  }

  /**
   * Parses the model answer as JSON, trying three candidates in order:
   *
   * 1. the whole text
   * 2. the content of a ```json fenced block
   * 3. the span from the first "{" to the last "}" (greedy match)
   *
   * A candidate is accepted only if it parses to a plain object containing
   * every requested field. If none is accepted, every field is filled with
   * the placeholder "解析失败" so callers always get a complete record.
   *
   * @param text Raw model output
   * @param fields Required fields
   */
  private parseJSON(text: string, fields: { name: string; desc: string }[]): Record<string, string> {
    const candidates: string[] = [text];

    const fenced = text.match(/```json\s*\n([\s\S]*?)\n```/)?.[1];
    if (fenced !== undefined) {
      candidates.push(fenced);
    }

    const braced = text.match(/\{[\s\S]*\}/)?.[0];
    if (braced !== undefined) {
      candidates.push(braced);
    }

    for (const candidate of candidates) {
      try {
        const parsed: unknown = JSON.parse(candidate);
        if (this.validateFields(parsed, fields)) {
          return parsed as Record<string, string>;
        }
      } catch {
        // Not valid JSON; fall through to the next candidate
      }
    }

    // Every strategy failed: return a placeholder record
    logger.warn('[JSON] All parse strategies failed', { text });
    const emptyResult: Record<string, string> = {};
    for (const field of fields) {
      emptyResult[field.name] = '解析失败';
    }
    return emptyResult;
  }

  /**
   * Checks that `parsed` is a plain (non-array) object that has every
   * requested field as an own property.
   */
  private validateFields(parsed: unknown, fields: { name: string; desc: string }[]): boolean {
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      return false;
    }
    // Go through Object.prototype so a model-supplied "hasOwnProperty" key
    // cannot shadow the method.
    return fields.every(f => Object.prototype.hasOwnProperty.call(parsed, f.name));
  }

  /**
   * Reports whether two extraction results differ.
   *
   * Keys are sorted before serialisation so that two models returning the
   * same values in a different key order are NOT treated as a conflict.
   * Nested objects are not re-ordered; values are expected to be scalars.
   */
  private hasConflict(resultA: Record<string, unknown>, resultB: Record<string, unknown>): boolean {
    const canonical = (result: Record<string, unknown>): string =>
      JSON.stringify(Object.keys(result).sort().map(key => [key, result[key]]));
    return canonical(resultA) !== canonical(resultB);
  }

  /**
   * Runs extraction for every item of a task (asynchronous job entry point).
   *
   * Items are processed sequentially. A failing item is marked 'failed' and
   * does not stop the batch; note that failed items are not counted in
   * `processedCount`. When the loop finishes the task is marked 'completed'.
   *
   * @param taskId Task ID
   * @throws Error when the task or its template cannot be found, or when a
   *   database operation outside the per-item extraction fails. The task is
   *   marked 'failed' on a best-effort basis before the error is re-thrown.
   */
  async batchExtract(taskId: string): Promise<void> {
    try {
      logger.info('[Batch] ===== Starting batch extraction =====', { taskId });

      // 1. Load the task together with its items
      logger.info('[Batch] Step 1: Fetching task from database', { taskId });
      const task = await prisma.dCExtractionTask.findUnique({
        where: { id: taskId },
        include: { items: true }
      });
      if (!task) {
        logger.error('[Batch] Task not found in database', { taskId });
        throw new Error(`Task not found: ${taskId}`);
      }
      logger.info('[Batch] Task fetched successfully', {
        taskId,
        itemCount: task.items.length,
        diseaseType: task.diseaseType,
        reportType: task.reportType
      });

      // 2. Mark the task as running
      await prisma.dCExtractionTask.update({
        where: { id: taskId },
        data: {
          status: 'processing',
          startedAt: new Date()
        }
      });

      // 3. Load the template for this disease/report type
      const template = await prisma.dCTemplate.findUnique({
        where: {
          diseaseType_reportType: {
            diseaseType: task.diseaseType,
            reportType: task.reportType
          }
        }
      });
      if (!template) {
        throw new Error(`Template not found: ${task.diseaseType}/${task.reportType}`);
      }
      const fields = template.fields as { name: string; desc: string }[];

      // 4. Process items one by one
      let processedCount = 0;
      let cleanCount = 0;
      let conflictCount = 0;
      let totalTokens = 0;

      for (const item of task.items) {
        try {
          const { resultA, resultB } = await this.extract(
            {
              text: item.originalText,
              fields,
              promptTemplate: template.promptTemplate
            },
            taskId,
            item.id
          );

          // Simple equality-based conflict check (key order is ignored)
          const hasConflict = this.hasConflict(resultA.result, resultB.result);

          await prisma.dCExtractionItem.update({
            where: { id: item.id },
            data: {
              resultA: resultA.result as any,
              resultB: resultB.result as any,
              tokensA: resultA.tokensUsed,
              tokensB: resultB.tokensUsed,
              status: hasConflict ? 'conflict' : 'clean',
              // When both models agree, the result is adopted automatically
              finalResult: (hasConflict ? null : resultA.result) as any
            }
          });

          processedCount++;
          if (hasConflict) {
            conflictCount++;
          } else {
            cleanCount++;
          }
          totalTokens += resultA.tokensUsed + resultB.tokensUsed;

          // Persist progress after every item
          await prisma.dCExtractionTask.update({
            where: { id: taskId },
            data: {
              processedCount,
              cleanCount,
              conflictCount,
              totalTokens
            }
          });
        } catch (error) {
          logger.error('[Batch] Item extraction failed', { error, itemId: item.id });
          await prisma.dCExtractionItem.update({
            where: { id: item.id },
            data: {
              status: 'failed',
              error: String(error)
            }
          });
        }
      }

      // 5. Mark the task as finished
      await prisma.dCExtractionTask.update({
        where: { id: taskId },
        data: {
          status: 'completed',
          completedAt: new Date()
        }
      });
      logger.info('[Batch] Batch extraction completed', {
        taskId,
        processedCount,
        cleanCount,
        conflictCount,
        totalTokens
      });
    } catch (error) {
      logger.error('[Batch] Batch extraction failed', { error, taskId });

      // Best-effort status update. It must not replace the original error:
      // e.g. when the task does not exist, this update itself fails.
      try {
        await prisma.dCExtractionTask.update({
          where: { id: taskId },
          data: {
            status: 'failed',
            error: String(error)
          }
        });
      } catch (updateError) {
        logger.error('[Batch] Failed to mark task as failed', { error: updateError, taskId });
      }

      throw error;
    }
  }
}
// Export a shared singleton instance of the service
export const dualModelExtractionService = new DualModelExtractionService();