feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- 3 system templates seed (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions

View File

@@ -0,0 +1,244 @@
import { FastifyRequest, FastifyReply } from 'fastify';
import { prisma } from '../../../../config/database.js';
import { templateService } from '../services/TemplateService.js';
import { extractionService } from '../services/ExtractionService.js';
import { pkbBridgeService } from '../services/PkbBridgeService.js';
import { extractionEventBus } from '../services/ExtractionEventBus.js';
import { extractionExcelExporter } from '../services/ExtractionExcelExporter.js';
import { logger } from '../../../../common/logging/index.js';
/**
 * Resolves the authenticated user's id from the request.
 *
 * Reads `request.user.userId`; presumably populated by an upstream auth hook
 * that is not visible in this file (TODO confirm).
 *
 * @param request - Incoming Fastify request.
 * @returns The user id attached to the request.
 * @throws Error with message `User not authenticated` and `statusCode` 401
 *   when the request carries no (or an empty) user id.
 */
function getUserId(request: FastifyRequest): string {
  // Narrow structural view instead of `any`, so only the field we read is trusted.
  const { user } = request as FastifyRequest & { user?: { userId?: string } };
  const userId = user?.userId;
  if (!userId) {
    // Tag the error with a status code so the framework's error handling can
    // answer 401 instead of a generic 500.
    throw Object.assign(new Error('User not authenticated'), { statusCode: 401 });
  }
  return userId;
}
/**
* Tool 3 full-text extraction API controller
*/
// ═══════════════════════════════════════════
// Template API
// ═══════════════════════════════════════════
/**
 * Handler that responds with every system template returned by
 * `templateService.listSystemTemplates()`.
 *
 * @param request - Incoming request (not read by this handler).
 * @param reply - Reply used to send `{ success: true, data }`.
 */
export async function listTemplates(request: FastifyRequest, reply: FastifyReply) {
  const data = await templateService.listSystemTemplates();
  const payload = { success: true, data };
  return reply.send(payload);
}
/**
 * Handler that responds with the system template identified by the
 * `templateId` route parameter.
 *
 * @param request - Request carrying `params.templateId`.
 * @param reply - Reply used to send `{ success: true, data }`.
 */
export async function getTemplate(
  request: FastifyRequest<{ Params: { templateId: string } }>,
  reply: FastifyReply,
) {
  const { templateId } = request.params;
  const data = await templateService.getSystemTemplate(templateId);
  return reply.send({ success: true, data });
}
/**
 * Handler that clones a base template into a project on behalf of the
 * authenticated user.
 *
 * @param request - Request whose body carries `projectId` and `baseTemplateId`.
 * @param reply - Reply used to send `{ success: true, data }` with the clone.
 * @throws Whatever `getUserId` throws when the request has no user id.
 */
export async function cloneTemplate(
  request: FastifyRequest<{ Body: { projectId: string; baseTemplateId: string } }>,
  reply: FastifyReply,
) {
  // Authentication is resolved before the body is touched.
  const ownerId = getUserId(request);
  const body = request.body;
  const data = await templateService.cloneToProject(
    body.projectId,
    body.baseTemplateId,
    ownerId,
  );
  return reply.send({ success: true, data });
}
// ═══════════════════════════════════════════
// Extraction task API
// ═══════════════════════════════════════════
/**
 * Handler that creates an extraction task for the authenticated user.
 *
 * Forwards the request body fields, the resolved user id and the PKB bridge
 * to `extractionService.createTask`, then spreads the service result into the
 * response next to `success: true`.
 *
 * @param request - Request whose body describes the task to create.
 * @param reply - Reply used to send `{ success: true, ...result }`.
 * @throws Whatever `getUserId` throws when the request has no user id.
 */
export async function createTask(
  request: FastifyRequest<{
    Body: {
      projectId: string;
      projectTemplateId: string;
      pkbKnowledgeBaseId: string;
      documentIds: string[];
      idempotencyKey?: string;
    };
  }>,
  reply: FastifyReply,
) {
  const userId = getUserId(request);
  const body = request.body;

  const created = await extractionService.createTask({
    projectId: body.projectId,
    userId,
    projectTemplateId: body.projectTemplateId,
    pkbKnowledgeBaseId: body.pkbKnowledgeBaseId,
    documentIds: body.documentIds,
    idempotencyKey: body.idempotencyKey,
    pkbBridge: pkbBridgeService,
  });

  // Keys of the service result are spread last, so they win over `success`.
  const payload = { success: true, ...created };
  return reply.send(payload);
}
/**
 * Handler that responds with the status of the task named by the `taskId`
 * route parameter, as reported by `extractionService.getTaskStatus`.
 *
 * @param request - Request carrying `params.taskId`.
 * @param reply - Reply used to send `{ success: true, data }`.
 */
export async function getTaskStatus(
  request: FastifyRequest<{ Params: { taskId: string } }>,
  reply: FastifyReply,
) {
  const { taskId } = request.params;
  const data = await extractionService.getTaskStatus(taskId);
  return reply.send({ success: true, data });
}
/**
 * Handler that responds with the extraction results of the task named by the
 * `taskId` route parameter, as returned by `extractionService.getResults`.
 *
 * @param request - Request carrying `params.taskId`.
 * @param reply - Reply used to send `{ success: true, data }`.
 */
export async function getTaskResults(
  request: FastifyRequest<{ Params: { taskId: string } }>,
  reply: FastifyReply,
) {
  const { taskId } = request.params;
  const data = await extractionService.getResults(taskId);
  return reply.send({ success: true, data });
}
// ═══════════════════════════════════════════
// PKB data proxy API (the frontend goes through ASL and never calls PKB directly)
// ═══════════════════════════════════════════
/**
 * Handler that responds with the knowledge bases `pkbBridgeService` returns
 * for the authenticated user.
 *
 * @param request - Incoming request; only the attached user id is read.
 * @param reply - Reply used to send `{ success: true, data }`.
 * @throws Whatever `getUserId` throws when the request has no user id.
 */
export async function listKnowledgeBases(
  request: FastifyRequest,
  reply: FastifyReply,
) {
  const ownerId = getUserId(request);
  const data = await pkbBridgeService.listKnowledgeBases(ownerId);
  return reply.send({ success: true, data });
}
/**
 * Handler that responds with the PDF documents of the knowledge base named by
 * the `kbId` route parameter.
 *
 * NOTE(review): unlike `listKnowledgeBases`, no user id is passed to the
 * bridge here — confirm that access to `kbId` is checked elsewhere.
 *
 * @param request - Request carrying `params.kbId`.
 * @param reply - Reply used to send `{ success: true, data }`.
 */
export async function listDocuments(
  request: FastifyRequest<{ Params: { kbId: string } }>,
  reply: FastifyReply,
) {
  const { kbId } = request.params;
  const data = await pkbBridgeService.listPdfDocuments(kbId);
  return reply.send({ success: true, data });
}
// ═══════════════════════════════════════════
// Single extraction result detail + review API (added in M2)
// ═══════════════════════════════════════════
/**
 * Handler that responds with one extraction result plus the field schema
 * needed to render it.
 *
 * The schema is the base template's `baseFields`, minus every `outcomes_*`
 * module other than the one matching the project template's `outcomeType`
 * (falling back to `'survival'` when that value is falsy). When the result
 * has no reachable `baseFields`, `schema` is `undefined`.
 *
 * @param request - Request carrying `params.resultId`.
 * @param reply - Reply used to send the detail, or a 404 when no row matches.
 */
export async function getResultDetail(
  request: FastifyRequest<{ Params: { resultId: string } }>,
  reply: FastifyReply,
) {
  const { resultId } = request.params;

  const record = await prisma.aslExtractionResult.findUnique({
    where: { id: resultId },
    include: {
      task: {
        select: {
          projectTemplate: {
            include: { baseTemplate: true },
          },
        },
      },
    },
  });

  if (!record) {
    return reply.status(404).send({ success: false, error: 'Result not found' });
  }

  const projectTemplate = record.task?.projectTemplate;
  const baseFields = projectTemplate?.baseTemplate?.baseFields as Record<string, any[]> | undefined;
  const outcomeType = projectTemplate?.outcomeType || 'survival';

  // Keep non-outcome modules and only the outcome module that matches outcomeType.
  let schema: Record<string, any[]> | undefined;
  if (baseFields) {
    const selectedOutcomeModule = `outcomes_${outcomeType}`;
    const filtered: Record<string, any[]> = {};
    Object.entries(baseFields).forEach(([moduleName, fields]) => {
      const isOtherOutcome =
        moduleName.startsWith('outcomes_') && moduleName !== selectedOutcomeModule;
      if (!isOtherOutcome) {
        filtered[moduleName] = fields;
      }
    });
    schema = filtered;
  }

  const data = {
    id: record.id,
    pkbDocumentId: record.pkbDocumentId,
    snapshotFilename: record.snapshotFilename,
    snapshotStorageKey: record.snapshotStorageKey,
    status: record.status,
    reviewStatus: record.reviewStatus,
    extractedData: record.extractedData,
    quoteVerification: record.quoteVerification,
    errorMessage: record.errorMessage,
    processedAt: record.processedAt,
    createdAt: record.createdAt,
    schema,
    outcomeType,
  };

  return reply.send({ success: true, data });
}
/**
 * Handler that records a reviewer's decision on one extraction result.
 *
 * Sets `reviewStatus` and stamps `reviewedAt` with the current time.
 *
 * @param request - Request carrying `params.resultId` and `body.reviewStatus`
 *   (`'approved'` or `'rejected'`).
 * @param reply - Reply used to send the updated row, a 400 for an invalid
 *   `reviewStatus`, or a 404 when no result matches `resultId`.
 * @throws Re-throws any database error other than "record not found".
 */
export async function reviewResult(
  request: FastifyRequest<{
    Params: { resultId: string };
    Body: { reviewStatus: 'approved' | 'rejected' };
  }>,
  reply: FastifyReply,
) {
  const { resultId } = request.params;

  // The Body type is erased at runtime, so the value is checked explicitly
  // before it is written to the database.
  const reviewStatus = request.body?.reviewStatus;
  if (reviewStatus !== 'approved' && reviewStatus !== 'rejected') {
    return reply.status(400).send({ success: false, error: 'Invalid reviewStatus' });
  }

  try {
    const updated = await prisma.aslExtractionResult.update({
      where: { id: resultId },
      data: {
        reviewStatus,
        reviewedAt: new Date(),
      },
    });
    return reply.send({ success: true, data: updated });
  } catch (err: unknown) {
    // Prisma reports an update against a missing row with error code P2025;
    // answer 404 with the same body getResultDetail uses for a missing result.
    if ((err as { code?: unknown } | null)?.code === 'P2025') {
      return reply.status(404).send({ success: false, error: 'Result not found' });
    }
    throw err;
  }
}
// ═══════════════════════════════════════════
// SSE log stream endpoint (added in M2)
// ═══════════════════════════════════════════
/**
 * Handler that streams a task's extraction logs as Server-Sent Events.
 *
 * Writes directly to the raw Node response: first a `sync` event with the
 * logs `extractionEventBus.getRecentLogs` returns, then one `log` event per
 * entry published for the task, plus a comment-line heartbeat every 15 s.
 * The subscription and the heartbeat timer are released when the response
 * closes.
 *
 * NOTE(review): `token` is declared in the querystring type but never read
 * here — confirm where (if anywhere) it is validated.
 * NOTE(review): the handler writes to `reply.raw` without `reply.hijack()`;
 * check the Fastify Reply docs for the version in use.
 *
 * @param request - Request carrying `params.taskId`.
 * @param reply - Reply whose raw response is used as the event stream.
 */
export async function streamTaskLogs(
  request: FastifyRequest<{ Params: { taskId: string }; Querystring: { token?: string } }>,
  reply: FastifyReply,
) {
  const { taskId } = request.params;
  const stream = reply.raw;

  stream.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    Connection: 'keep-alive',
    'X-Accel-Buffering': 'no',
  });

  // Never write to a response that has already ended or been torn down.
  const writeFrame = (frame: string): void => {
    if (!stream.writableEnded && !stream.destroyed) {
      stream.write(frame);
    }
  };

  // First frame: replay the recent log history.
  const recentLogs = extractionEventBus.getRecentLogs(taskId);
  writeFrame(`event: sync\ndata: ${JSON.stringify({ logs: recentLogs })}\n\n`);

  // Subscribe to live log entries.
  const unsubscribe = extractionEventBus.subscribe(taskId, (entry) => {
    writeFrame(`event: log\ndata: ${JSON.stringify(entry)}\n\n`);
  });

  // Heartbeat every 15 seconds to keep the connection from idling out.
  const heartbeat = setInterval(() => {
    writeFrame(':heartbeat\n\n');
  }, 15_000);

  // Idempotent teardown of the timer and the bus subscription.
  let released = false;
  const release = (): void => {
    if (released) return;
    released = true;
    clearInterval(heartbeat);
    unsubscribe();
  };

  // Listen on the response: its 'close' event covers both normal completion
  // and a prematurely terminated connection, whereas the request's 'close'
  // is documented as "request completed".
  stream.once('close', release);

  // If the client vanished before the listener was attached, 'close' has
  // already fired and would never reach us.
  if (stream.destroyed) {
    release();
  }
}
// ═══════════════════════════════════════════
// Excel export endpoint (added in M2)
// ═══════════════════════════════════════════
/**
 * Handler that sends a task's extraction results as an `.xlsx` attachment.
 *
 * The workbook bytes come from `extractionExcelExporter.exportToExcel`; the
 * download is named `extraction-<taskId>.xlsx`.
 *
 * @param request - Request carrying `params.taskId`.
 * @param reply - Reply used to set the download headers and send the buffer.
 */
export async function exportTaskResults(
  request: FastifyRequest<{ Params: { taskId: string } }>,
  reply: FastifyReply,
) {
  const taskId = request.params.taskId;
  const workbook = await extractionExcelExporter.exportToExcel(taskId);

  return reply
    .header('Content-Type', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
    .header('Content-Disposition', `attachment; filename="extraction-${taskId}.xlsx"`)
    .send(workbook);
}