docs(asl): Complete Tool 3 extraction workbench V2.0 development plan (v1.5)

ASL Tool 3 Development Plan:
- Architecture blueprint v1.5 (6 rounds of architecture review, 13 red lines)
- M1/M2/M3 sprint checklists (Skeleton Pipeline / HITL Workbench / Dynamic Template Engine)
- Code patterns cookbook (9 chapters: Fan-out, Prompt engineering, ACL, SSE dual-track, etc.)
- Key patterns: Fan-out with Last Child Wins, Optimistic Locking, teamConcurrency throttling
- PKB ACL integration (anti-corruption layer), MinerU Cache-Aside, NOTIFY/LISTEN cross-pod SSE
- Data consistency snapshot for long-running extraction tasks

Platform capability:
- Add distributed Fan-out task pattern development guide (7 patterns + 10 anti-patterns)
- Add system-level async architecture risk analysis blueprint
- Add PDF table extraction engine design and usage guide (MinerU integration)
- Add table extraction source code (TableExtractionManager + MinerU engine)

Documentation updates:
- Update ASL module status with Tool 3 V2.0 plan readiness
- Update system status document (v6.2) with latest milestones
- Add V2.0 product requirements, prototypes, and data dictionary specs
- Add architecture review documents (4 rounds of review feedback)
- Add test PDF files for extraction validation

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-23 22:49:16 +08:00
parent 8f06d4f929
commit dc6b292308
42 changed files with 16615 additions and 41 deletions

View File

@@ -0,0 +1,70 @@
/**
* PDF 表格提取引擎管理器
*
* 职责:引擎注册、默认引擎选择、统一调用入口。
* 使用者通过此管理器调用表格提取,无需关心底层引擎。
*/
import type {
ITableExtractionEngine,
ExtractionOptions,
ExtractionResult,
EngineType,
} from './types.js';
/**
 * Registry and single entry point for PDF table-extraction engines.
 *
 * Engines are stored by their `name`; callers extract tables through
 * `extractTables` without needing to know which engine does the work.
 */
export class TableExtractionManager {
  private readonly engines = new Map<string, ITableExtractionEngine>();
  private defaultEngineName: string;

  /**
   * @param defaultEngine Engine name used when a call does not name one.
   *   It is not validated here, so the engine may be registered afterwards.
   */
  constructor(defaultEngine: EngineType = 'mineru') {
    this.defaultEngineName = defaultEngine;
  }

  /** Registers an engine adapter under `engine.name`, replacing any engine already stored under that name. */
  register(engine: ITableExtractionEngine): void {
    this.engines.set(engine.name, engine);
  }

  /**
   * Changes the default engine.
   *
   * @throws Error when no engine with that name has been registered.
   */
  setDefault(name: EngineType): void {
    if (!this.engines.has(name)) {
      throw new Error(this.notRegisteredMessage(name));
    }
    this.defaultEngineName = name;
  }

  /** Returns the names of all registered engines, in registration order. */
  availableEngines(): string[] {
    return Array.from(this.engines.keys());
  }

  /**
   * Looks up an engine instance.
   *
   * @param name Engine name; when omitted or empty the default engine is used.
   * @throws Error when the resolved name has not been registered.
   */
  getEngine(name?: string): ITableExtractionEngine {
    const key = name || this.defaultEngineName;
    const engine = this.engines.get(key);
    if (!engine) {
      throw new Error(this.notRegisteredMessage(key));
    }
    return engine;
  }

  /**
   * Extracts tables from a PDF — the one entry point callers need.
   *
   * @param pdf PDF file contents.
   * @param filename File name including its extension.
   * @param options Optional settings; `engine` overrides the default engine for this call.
   *   The whole options object (including `engine`) is forwarded to the engine.
   * @throws Error when the selected engine is not registered, or whatever the engine itself throws.
   */
  async extractTables(
    pdf: Buffer,
    filename: string,
    options?: ExtractionOptions & { engine?: EngineType },
  ): Promise<ExtractionResult> {
    const engine = this.getEngine(options?.engine);
    return engine.extractTables(pdf, filename, options);
  }

  /** Builds the shared "not registered" message so both lookups report the registry the same way. */
  private notRegisteredMessage(name: string): string {
    const available = this.availableEngines().join(', ') || 'none';
    return `[TableExtractionManager] Engine "${name}" not registered. Available: ${available}`;
  }
}

View File

@@ -0,0 +1,249 @@
/**
* MinerU Cloud API 引擎适配器
*
* 完整流程:请求上传 URL → 上传 PDF → 轮询解析状态 → 下载 ZIP → 解析 HTML 表格
*
* API 文档: https://mineru.net/doc/docs/index_en/
* 免费额度: 2000 页/天 (vlm 模型)
*/
import axios from 'axios';
import AdmZip from 'adm-zip';
import type {
ITableExtractionEngine,
ExtractionOptions,
ExtractionResult,
} from '../types.js';
import { parseHtmlTablesFromMarkdown } from '../htmlTableParser.js';
// ─── 配置(延迟读取 process.env,兼容 dotenv 加载时序)───────
/**
 * Reads an environment variable at call time.
 *
 * @param key Variable name.
 * @param fallback Value returned when the variable is unset or an empty string.
 */
function getEnv(key: string, fallback: string): string {
  const value = process.env[key];
  return value ? value : fallback;
}
// Pause before each status poll of a submitted batch.
const POLL_INTERVAL_MS = 5_000;
// Poll budget: 120 attempts x 5 s interval.
const POLL_MAX_ATTEMPTS = 120; // wait at most 10 minutes
// Timeout for the upload-URL request and for each status poll request.
const REQUEST_TIMEOUT_MS = 30_000;
// ─── MinerU API 响应类型 ──────────────────────────────────────
/**
* Response body of the `POST {apiBase}/file-urls/batch` request.
* This file treats any `code` other than 0 as a failure and reports `msg`.
*/
interface BatchCreateResponse {
code: number;
msg: string;
data: {
// Batch identifier, used afterwards to poll for results.
batch_id: string;
// Upload URLs; the first entry is used as the PUT target for the PDF.
file_urls: string[];
};
}
/**
* Response body of the `GET {apiBase}/extract-results/batch/{batchId}` poll request.
* This file treats any `code` other than 0 as a failure and reports `msg`.
*/
interface BatchResultResponse {
code: number;
msg: string;
data: {
extract_result: Array<{
// Matched against the `data_id` sent when the upload URL was requested.
data_id: string;
// 'done' ends polling successfully, 'failed' aborts it; the other states keep polling.
state: 'waiting' | 'processing' | 'done' | 'failed';
// URL of the result ZIP that is downloaded once the state is 'done'.
full_zip_url?: string;
// Failure reason included in the error raised for the 'failed' state.
err_msg?: string;
// Copied into `ExtractionResult.pageCount`.
page_count?: number;
}>;
};
}
// ─── 引擎实现 ─────────────────────────────────────────────────
/**
 * Table-extraction engine backed by the MinerU cloud API.
 *
 * Flow of one extraction: request an upload URL, PUT the PDF to it, poll the
 * batch until the file is done, download the result ZIP, then parse the HTML
 * tables found in its Markdown files.
 */
export class MinerUEngine implements ITableExtractionEngine {
  readonly name = 'mineru';
  readonly displayName = 'MinerU Cloud API (VLM)';

  private apiBase: string;
  private token: string;
  private modelVersion: string;

  /**
   * @param options Explicit settings; each missing or empty value falls back to the
   *   `MINERU_API_BASE`, `MINERU_API_TOKEN` and `MINERU_MODEL_VERSION` environment variables.
   * @throws Error when no API token is available from either source.
   */
  constructor(options?: {
    apiBase?: string;
    token?: string;
    modelVersion?: string;
  }) {
    // `||` rather than `??` on purpose: an empty string also falls through to the environment/default.
    this.apiBase = options?.apiBase || getEnv('MINERU_API_BASE', 'https://mineru.net/api/v4');
    this.token = options?.token || getEnv('MINERU_API_TOKEN', '');
    this.modelVersion = options?.modelVersion || getEnv('MINERU_MODEL_VERSION', 'vlm');

    if (!this.token) {
      throw new Error(
        '[MinerUEngine] MINERU_API_TOKEN is required. Set it in .env or pass via constructor.',
      );
    }
  }

  /**
   * Runs the full upload → poll → download → parse pipeline for one PDF.
   *
   * @param pdf PDF file contents.
   * @param filename File name sent to the API.
   * @param options Only `keepRaw` is read here: when truthy the combined Markdown is returned too.
   * @returns Parsed tables plus engine name, elapsed milliseconds and the page count reported by the API.
   * @throws Error when any step fails or polling runs out of attempts.
   */
  async extractTables(
    pdf: Buffer,
    filename: string,
    options?: ExtractionOptions,
  ): Promise<ExtractionResult> {
    const startTime = Date.now();
    const dataId = `extract_${Date.now()}`;

    // Step 1: request a pre-signed upload URL.
    const { batchId, uploadUrl } = await this.requestUploadUrl(filename, dataId);

    // Step 2: upload the PDF.
    await this.uploadFile(uploadUrl, pdf);

    // Step 3: poll until parsing has finished.
    const result = await this.pollForResult(batchId, dataId);

    // Step 4: download the ZIP and read its Markdown.
    if (!result.full_zip_url) {
      throw new Error(`[MinerUEngine] No result URL. State: ${result.state}`);
    }
    const markdown = await this.downloadAndExtract(result.full_zip_url);

    // Step 5: parse the HTML tables embedded in the Markdown.
    const tables = parseHtmlTablesFromMarkdown(markdown);
    const duration = Date.now() - startTime;

    return {
      tables,
      engine: this.name,
      duration,
      pageCount: result.page_count,
      fullMarkdown: options?.keepRaw ? markdown : undefined,
    };
  }

  // ─── Private helpers ──────────────────────────────────────

  /** Headers sent with the JSON API calls (not with the file upload). */
  private get headers() {
    return {
      Authorization: `Bearer ${this.token}`,
      'Content-Type': 'application/json',
    };
  }

  /**
   * Creates a one-file batch and returns its id together with the upload URL.
   *
   * @throws Error when the request fails, the API reports a non-zero code,
   *   or the response carries no batch id / upload URL.
   */
  private async requestUploadUrl(
    filename: string,
    dataId: string,
  ): Promise<{ batchId: string; uploadUrl: string }> {
    const url = `${this.apiBase}/file-urls/batch`;
    const body = {
      files: [{ name: filename, data_id: dataId }],
      enable_table: true,
      model_version: this.modelVersion,
    };

    let resp;
    try {
      resp = await axios.post<BatchCreateResponse>(url, body, {
        headers: this.headers,
        timeout: REQUEST_TIMEOUT_MS,
      });
    } catch (err: unknown) {
      // Only axios errors carry a response; timeouts and DNS failures have none.
      const response = axios.isAxiosError(err) ? err.response : undefined;
      const statusLabel = response ? `HTTP ${response.status}` : 'no HTTP response';
      const fallback = err instanceof Error ? err.message : String(err);
      const detail = response?.data
        ? JSON.stringify(response.data).substring(0, 500)
        : fallback;
      throw new Error(
        `[MinerUEngine] Upload URL request failed (${statusLabel}): ${detail}`,
      );
    }

    if (resp.data.code !== 0) {
      throw new Error(
        `[MinerUEngine] Failed to get upload URL: ${resp.data.msg}`,
      );
    }

    // The payload comes from a remote service, so do not trust the declared shape.
    const batchId = resp.data.data?.batch_id;
    const uploadUrl = resp.data.data?.file_urls?.[0];
    if (!batchId || !uploadUrl) {
      throw new Error(
        '[MinerUEngine] Upload URL response is missing batch_id or file_urls',
      );
    }

    return { batchId, uploadUrl };
  }

  /**
   * PUTs the PDF bytes to the upload URL.
   *
   * @throws Error with the HTTP status and up to 500 characters of the response body on a non-2xx answer.
   */
  private async uploadFile(
    uploadUrl: string,
    pdf: Buffer,
  ): Promise<void> {
    const resp = await fetch(uploadUrl, {
      method: 'PUT',
      body: pdf,
    });
    if (!resp.ok) {
      // Reading the body is best effort; an unreadable body must not hide the status.
      const detail = await resp.text().catch(() => '');
      throw new Error(
        `[MinerUEngine] File upload failed (HTTP ${resp.status}): ${detail.substring(0, 500)}`,
      );
    }
  }

  /**
   * Polls the batch until the entry for `dataId` is done.
   *
   * Sleeps before every poll (including the first one) and gives up after
   * `POLL_MAX_ATTEMPTS` polls.
   *
   * @throws Error when the API reports a non-zero code, the file failed, or polling timed out.
   */
  private async pollForResult(
    batchId: string,
    dataId: string,
  ): Promise<BatchResultResponse['data']['extract_result'][0]> {
    for (let attempt = 0; attempt < POLL_MAX_ATTEMPTS; attempt++) {
      await sleep(POLL_INTERVAL_MS);

      const resp = await axios.get<BatchResultResponse>(
        `${this.apiBase}/extract-results/batch/${batchId}`,
        { headers: this.headers, timeout: REQUEST_TIMEOUT_MS },
      );

      if (resp.data.code !== 0) {
        throw new Error(
          `[MinerUEngine] Poll error: ${resp.data.msg}`,
        );
      }

      // A missing list is treated like "entry not listed yet": keep polling.
      const item = resp.data.data?.extract_result?.find(
        (r) => r.data_id === dataId,
      );
      if (!item) continue;

      if (item.state === 'done') return item;
      if (item.state === 'failed') {
        throw new Error(
          `[MinerUEngine] Extraction failed: ${item.err_msg || 'unknown error'}`,
        );
      }
      // 'waiting' / 'processing' → keep polling.
    }

    throw new Error(
      `[MinerUEngine] Polling timed out after ${(POLL_INTERVAL_MS * POLL_MAX_ATTEMPTS) / 1000}s`,
    );
  }

  /**
   * Downloads the result ZIP and returns the contents of every `.md` entry,
   * joined with blank lines.
   *
   * @throws Error when the archive contains no `.md` entry.
   */
  private async downloadAndExtract(zipUrl: string): Promise<string> {
    const resp = await axios.get(zipUrl, {
      responseType: 'arraybuffer',
      timeout: 60_000,
    });

    const zip = new AdmZip(Buffer.from(resp.data));
    const mdEntries = zip
      .getEntries()
      .filter((e) => e.entryName.endsWith('.md'));

    if (mdEntries.length === 0) {
      throw new Error('[MinerUEngine] No .md file found in result ZIP');
    }

    return mdEntries.map((e) => e.getData().toString('utf-8')).join('\n\n');
  }
}
// ─── 工具 ─────────────────────────────────────────────────────
/** Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

View File

@@ -0,0 +1,150 @@
/**
* HTML <table> 解析器
*
* 将 MinerU 等引擎输出的 HTML 表格转换为统一的 ExtractedTable 结构。
* 纯正则 + 字符串处理,无额外依赖(Node.js 原生即可)。
*/
import type { ExtractedTable, MergedCell } from './types.js';
// ─── 公共 API ─────────────────────────────────────────────────
/**
 * Finds every `<table>…</table>` fragment in a Markdown document and parses each one.
 *
 * @param markdown Markdown text that may embed HTML tables.
 * @returns One `ExtractedTable` per fragment, in document order; the title is looked up
 *   in the text just before the fragment.
 */
export function parseHtmlTablesFromMarkdown(markdown: string): ExtractedTable[] {
  const tablePattern = /<table[\s\S]*?<\/table>/gi;
  return Array.from(markdown.matchAll(tablePattern), (hit) => {
    const caption = extractTableTitle(markdown, hit.index ?? 0);
    return parseOneHtmlTable(hit[0], caption);
  });
}
/**
 * Parses one HTML `<table>` string into an `ExtractedTable`.
 *
 * Cells are placed on a real grid: a cell with `colspan`/`rowspan` occupies all the
 * positions it covers, so the cells after it (and in the rows below it) land in their
 * true column. The cell text is stored at the top-left position of its span and the
 * other covered positions hold an empty string.
 *
 * @param html Markup of a single table.
 * @param title Caption to attach to the result (defaults to an empty string).
 * @returns The first row as `headers` and the remaining rows as `rows`, all padded to the
 *   same width. `mergedCells` uses grid coordinates in which the header row is row 0, and
 *   reports the effective spans after clamping.
 */
export function parseOneHtmlTable(html: string, title = ''): ExtractedTable {
  // HTML limits colspan to 1000; clamping keeps malformed markup from inflating the grid.
  const MAX_COL_SPAN = 1000;

  const rowHtmls = extractRows(html);
  const rowCount = rowHtmls.length;
  // `undefined` marks a free position; a string (possibly empty) marks an occupied one.
  const grid: Array<Array<string | undefined>> = rowHtmls.map(() => []);
  const mergedCells: MergedCell[] = [];

  rowHtmls.forEach((rowHtml, rowIdx) => {
    const currentRow = grid[rowIdx] ?? [];
    let col = 0;

    for (const cell of extractCells(rowHtml)) {
      // Skip positions already covered by a rowspan coming from a row above.
      while (currentRow[col] !== undefined) col++;

      // A span below 1 counts as 1, and a rowspan never reaches past the last row.
      const rowSpan = Math.min(Math.max(cell.rowSpan, 1), rowCount - rowIdx);
      const colSpan = Math.min(Math.max(cell.colSpan, 1), MAX_COL_SPAN);

      for (let r = rowIdx; r < rowIdx + rowSpan; r++) {
        const coveredRow = grid[r];
        if (!coveredRow) break;
        for (let c = col; c < col + colSpan; c++) {
          coveredRow[c] = '';
        }
      }
      currentRow[col] = cell.text;

      if (rowSpan > 1 || colSpan > 1) {
        mergedCells.push({ row: rowIdx, col, rowSpan, colSpan });
      }
      col += colSpan;
    }
  });

  // Align every row to the widest one; free positions become empty strings.
  const maxCols = grid.reduce((widest, row) => Math.max(widest, row.length), 0);
  const parsedRows: string[][] = grid.map((row) =>
    Array.from({ length: maxCols }, (_, c) => row[c] ?? ''),
  );

  // The first row is the header, everything after it is data.
  const [headers = [], ...dataRows] = parsedRows;

  return {
    title,
    headers,
    rows: dataRows,
    mergedCells,
    footnotes: [],
    rawHtml: html,
  };
}
// ─── 内部工具 ─────────────────────────────────────────────────
/** One parsed `<td>`/`<th>` cell as produced by `extractCells`. */
interface CellInfo {
// Cell content with markup removed and surrounding whitespace trimmed.
text: string;
// Value of the `rowspan` attribute; 1 when the attribute is absent.
rowSpan: number;
// Value of the `colspan` attribute; 1 when the attribute is absent.
colSpan: number;
}
/** Returns every `<tr …>…</tr>` fragment of the table markup, in document order. */
function extractRows(tableHtml: string): string[] {
  const rowPattern = /<tr[\s>][\s\S]*?<\/tr>/gi;
  return Array.from(tableHtml.matchAll(rowPattern), (hit) => hit[0]);
}
/**
 * Splits one `<tr>` fragment into its `<td>`/`<th>` cells.
 *
 * @returns For each cell: its text with markup stripped and trimmed, plus its
 *   `rowspan` and `colspan` values.
 */
function extractCells(trHtml: string): CellInfo[] {
  const cellPattern = /<(td|th)([\s\S]*?)>([\s\S]*?)<\/\1>/gi;
  return Array.from(trHtml.matchAll(cellPattern), (hit) => {
    const attributes = hit[2];
    const content = hit[3];
    return {
      text: stripHtml(content).trim(),
      rowSpan: parseIntAttr(attributes, 'rowspan'),
      colSpan: parseIntAttr(attributes, 'colspan'),
    };
  });
}
/**
 * Reads a numeric attribute such as `rowspan="2"` from a tag's attribute text.
 *
 * @param attrs Raw attribute text of the tag.
 * @param name Attribute name; matched case-insensitively, quotes around the value are optional.
 * @returns The parsed integer, or 1 when the attribute is not present.
 */
function parseIntAttr(attrs: string, name: string): number {
  const pattern = new RegExp(`${name}\\s*=\\s*["']?(\\d+)["']?`, 'i');
  const digits = attrs.match(pattern)?.[1];
  return digits === undefined ? 1 : Number.parseInt(digits, 10);
}
/**
 * Removes HTML tags and decodes common character references.
 *
 * `<br>` becomes a space, every other tag is dropped, and runs of whitespace collapse
 * to a single space. Entities are decoded in one pass so text that is itself an escaped
 * entity (`&amp;lt;`) is decoded once only. Supported: `&amp;`, `&lt;`, `&gt;`,
 * `&nbsp;`, `&quot;`, `&apos;`, decimal (`&#65;`) and hexadecimal (`&#x41;`) references.
 * References beyond U+10FFFF are left untouched.
 */
function stripHtml(html: string): string {
  const namedEntities: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    nbsp: ' ',
    quot: '"',
    apos: "'",
  };
  const entityPattern = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|nbsp|quot|apos));/g;

  return html
    .replace(/<br\s*\/?>/gi, ' ')
    .replace(/<[^>]+>/g, '')
    .replace(entityPattern, (entity: string, dec?: string, hex?: string, name?: string) => {
      if (name !== undefined) return namedEntities[name] ?? entity;
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // fromCodePoint throws outside the Unicode range, so keep such references as text.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    })
    .replace(/\s+/g, ' ');
}
/**
 * Looks backwards from a table's position for its caption.
 *
 * Only the 500 characters before the table are inspected, and within them only the
 * last five non-blank lines, nearest first. Bold markers (`**`) and leading `#`
 * heading markers are removed before matching; a line qualifies when it starts with
 * "Table <digit>" (any case) or "表<digit>".
 *
 * @param markdown Full Markdown document.
 * @param tableOffset Index in `markdown` where the table starts.
 * @returns The cleaned caption line, or an empty string when none qualifies.
 */
function extractTableTitle(markdown: string, tableOffset: number): string {
  const windowStart = Math.max(0, tableOffset - 500);
  const nearestFirst = markdown
    .substring(windowStart, tableOffset)
    .split('\n')
    .filter((line) => line.trim().length > 0)
    .slice(-5)
    .reverse()
    .map((line) => line.trim().replace(/\*\*/g, '').replace(/^#+\s*/, ''));

  const caption = nearestFirst.find(
    (text) => /^table\s+\d/i.test(text) || /^表\s*\d/i.test(text),
  );
  return caption ?? '';
}

View File

@@ -0,0 +1,47 @@
/**
* PDF 表格提取引擎 — 统一导出
*
* 使用方式:
* import { getTableExtractionManager } from '@/common/document/tableExtraction';
* const result = await getTableExtractionManager().extractTables(pdfBuffer, 'paper.pdf');
* for (const table of result.tables) { ... }
*/
// ─── 类型导出 ─────────────────────────────────────────────────
export type {
ITableExtractionEngine,
ExtractionOptions,
ExtractionResult,
ExtractedTable,
MergedCell,
EngineType,
} from './types.js';
// ─── 类导出 ───────────────────────────────────────────────────
export { TableExtractionManager } from './TableExtractionManager.js';
export { MinerUEngine } from './engines/MinerUEngine.js';
export { parseHtmlTablesFromMarkdown, parseOneHtmlTable } from './htmlTableParser.js';
// ─── 全局单例 ─────────────────────────────────────────────────
import { TableExtractionManager } from './TableExtractionManager.js';
import { MinerUEngine } from './engines/MinerUEngine.js';
let _instance: TableExtractionManager | null = null;

/**
 * Returns the process-wide `TableExtractionManager`, creating it on first use.
 *
 * The MinerU engine is registered as soon as `MINERU_API_TOKEN` is present in the
 * environment. The check runs on every call, so a token that becomes available after
 * the first call (for example because dotenv was loaded later) is still picked up.
 * An engine already registered under the name 'mineru' is never replaced.
 */
export function getTableExtractionManager(): TableExtractionManager {
  if (!_instance) {
    _instance = new TableExtractionManager('mineru');
  }
  if (
    process.env.MINERU_API_TOKEN &&
    !_instance.availableEngines().includes('mineru')
  ) {
    _instance.register(new MinerUEngine());
  }
  return _instance;
}

View File

@@ -0,0 +1,89 @@
/**
* PDF 表格提取引擎 — 统一类型定义
*
* 核心原则:使用者只需提交 PDF获取 ExtractedTable[],无需关心底层引擎实现。
* 所有引擎适配器必须实现 ITableExtractionEngine 接口。
*/
// ─── 引擎接口 ───────────────────────────────────────────────
/** Contract that every table-extraction engine adapter implements. */
export interface ITableExtractionEngine {
/** Unique engine identifier (e.g. 'mineru', 'qwen3vl', 'paddle'). */
readonly name: string;
/** Human-readable engine name. */
readonly displayName: string;
/** Extracts tables from a PDF held in a Buffer. */
extractTables(
pdf: Buffer,
filename: string,
options?: ExtractionOptions,
): Promise<ExtractionResult>;
}
// ─── 选项与结果 ──────────────────────────────────────────────
/** Optional settings accepted by `extractTables`. */
export interface ExtractionOptions {
/** Language hint; some engines may use it to improve recognition. */
language?: 'zh' | 'en' | 'auto';
/** Page numbers to process, e.g. [1,2,5]. */
pages?: number[];
/** Whether to keep the raw HTML / Markdown output. */
keepRaw?: boolean;
}
/** Outcome of one extraction call. */
export interface ExtractionResult {
/** Tables that were extracted. */
tables: ExtractedTable[];
/** Name of the engine that produced the result. */
engine: string;
/** Processing time in milliseconds. */
duration: number;
/** Total number of PDF pages (only some engines report it). */
pageCount?: number;
/** Complete Markdown returned by the engine (optional). */
fullMarkdown?: string;
}
// ─── 表格数据结构 ─────────────────────────────────────────────
/** Engine-independent representation of one extracted table. */
export interface ExtractedTable {
/** Table caption (e.g. "Table 1. Baseline characteristics"). */
title: string;
/** Header cells. */
headers: string[];
/** Data rows (two-dimensional array; every row has the same length as `headers`). */
rows: string[][];
/** Merged-cell information. */
mergedCells: MergedCell[];
/** Table footnotes. */
footnotes: string[];
/** PDF page the table is on (1-based). */
pageNumber?: number;
/** Raw HTML fragment (engines such as MinerU output HTML directly). */
rawHtml?: string;
/** Raw Markdown fragment. */
rawMarkdown?: string;
}
/** Position and extent of a cell that spans more than one row or column. */
export interface MergedCell {
/** Starting row index (0-based). */
row: number;
/** Starting column index (0-based). */
col: number;
/** Number of rows spanned. */
rowSpan: number;
/** Number of columns spanned. */
colSpan: number;
}
// ─── 引擎类型 ─────────────────────────────────────────────────
/** Engine names accepted wherever an engine is selected by name. */
export type EngineType =
| 'mineru'
| 'qwen3vl'
| 'paddle'
| 'qwenocr'
| 'docling'
| 'deepseek';

View File

@@ -0,0 +1,149 @@
/**
* PDF 表格提取引擎集成测试
*
* 使用 MinerU Cloud API 从真实医学 PDF 中提取表格,验证引擎完整流程。
*
* 用法:
* npx tsx src/tests/test-table-extraction.ts <pdf文件路径>
*
* 示例:
* npx tsx src/tests/test-table-extraction.ts "../docs/03-业务模块/ASL-AI智能文献/05-测试文档/PDF/Herrschaft 2012.pdf"
*/
import { config } from 'dotenv';
config();
import fs from 'fs';
import path from 'path';
import {
getTableExtractionManager,
type ExtractedTable,
} from '../common/document/tableExtraction/index.js';
// ─── 主流程 ───────────────────────────────────────────────────
/**
 * Integration-test driver: extracts the tables of one PDF through the shared
 * manager and prints a summary.
 *
 * Steps: resolve the PDF path (CLI argument or the default test directory), check
 * that `MINERU_API_TOKEN` is set, run the extraction with `keepRaw`, print every
 * table, and save the returned Markdown under `test-output/`.
 * Exits the process with code 1 when the file is missing, the token is not set,
 * or the extraction fails.
 */
async function main() {
  console.log('========================================');
  console.log(' PDF 表格提取引擎 — 集成测试');
  console.log('========================================\n');

  // 1. Resolve the test file.
  const pdfPath = process.argv[2] || findDefaultTestPdf();
  if (!pdfPath || !fs.existsSync(pdfPath)) {
    console.error(`文件不存在: ${pdfPath || '(未指定)'}`);
    console.log('\n用法: npx tsx src/tests/test-table-extraction.ts <pdf路径>');
    process.exit(1);
  }

  const filename = path.basename(pdfPath);
  const fileSize = fs.statSync(pdfPath).size;
  console.log(`文件: ${filename}`);
  console.log(`大小: ${(fileSize / 1024).toFixed(1)} KB`);
  console.log(`路径: ${path.resolve(pdfPath)}\n`);

  // 2. Check the environment; the token itself is never printed.
  if (!process.env.MINERU_API_TOKEN) {
    console.error('MINERU_API_TOKEN 未设置,请检查 backend/.env');
    process.exit(1);
  }
  console.log('MINERU_API_TOKEN: ...已配置');
  console.log(`MINERU_API_BASE: ${process.env.MINERU_API_BASE || 'https://mineru.net/api/v4'}`);
  console.log(`MINERU_MODEL_VERSION: ${process.env.MINERU_MODEL_VERSION || 'vlm'}\n`);

  // 3. Obtain the engine manager.
  const manager = getTableExtractionManager();
  console.log(`已注册引擎: [${manager.availableEngines().join(', ')}]`);
  console.log('默认引擎: mineru\n');

  // 4. Run the extraction.
  console.log('--- 开始提取 ---\n');
  const pdfBuffer = fs.readFileSync(pdfPath);

  try {
    const result = await manager.extractTables(pdfBuffer, filename, {
      keepRaw: true,
    });

    // 5. Print the result.
    console.log('\n--- 提取完成 ---\n');
    console.log(`引擎: ${result.engine}`);
    console.log(`耗时: ${(result.duration / 1000).toFixed(1)}s`);
    console.log(`PDF 页数: ${result.pageCount ?? '未知'}`);
    console.log(`检出表格: ${result.tables.length}\n`);

    if (result.tables.length === 0) {
      console.log('未检出任何表格。');
    }
    result.tables.forEach((table, index) => printTable(index, table));

    // 6. Save the complete Markdown when the engine returned it.
    if (result.fullMarkdown) {
      const outDir = path.resolve(process.cwd(), 'test-output');
      // `recursive: true` makes this a no-op when the directory already exists.
      fs.mkdirSync(outDir, { recursive: true });
      const mdPath = path.join(outDir, `${filename}.md`);
      fs.writeFileSync(mdPath, result.fullMarkdown, 'utf-8');
      console.log(`\n完整 Markdown 已保存: ${mdPath}`);
    }

    console.log('\n测试通过');
  } catch (err: unknown) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    const reason = err instanceof Error ? err.message : String(err);
    console.error('\n提取失败:', reason);
    process.exit(1);
  }
}
// ─── 工具函数 ─────────────────────────────────────────────────
/**
 * Prints a short summary of one extracted table to the console: caption, column and
 * row counts, merged-cell count, optional page number, the header row and at most the
 * first three data rows (each cell shortened or padded to 20 characters).
 *
 * @param index Zero-based position of the table; displayed as `index + 1`.
 * @param table Table to summarise.
 */
function printTable(index: number, table: ExtractedTable) {
  const { title, headers, rows, mergedCells, pageNumber } = table;
  const formatRow = (cells: string[]): string =>
    cells.map((cell) => truncate(cell, 20)).join(' | ');

  console.log(`────────────────────────────────────────`);
  console.log(`表格 ${index + 1}: ${title || '(无标题)'}`);
  console.log(` 列数: ${headers.length}`);
  console.log(` 行数: ${rows.length}`);
  console.log(` 合并单元格: ${mergedCells.length}`);
  if (pageNumber) console.log(` 页码: ${pageNumber}`);

  // Header row.
  if (headers.length > 0) {
    console.log(` 表头: ${formatRow(headers)}`);
  }

  // First three data rows.
  rows.slice(0, 3).forEach((row) => {
    console.log(` ${formatRow(row)}`);
  });

  const hiddenRows = rows.length - 3;
  if (hiddenRows > 0) {
    console.log(` ... 还有 ${hiddenRows}`);
  }
  console.log('');
}
/**
 * Fits a string into a fixed-width column: strings up to `maxLen` characters are
 * right-padded with spaces, longer ones are cut to `maxLen - 2` characters plus "..".
 */
function truncate(s: string, maxLen: number): string {
  return s.length > maxLen
    ? `${s.substring(0, maxLen - 2)}..`
    : s.padEnd(maxLen);
}
/**
 * Picks a default PDF for the test run: the first `.pdf` entry listed in the
 * documentation test folder, resolved relative to the current working directory.
 *
 * @returns Path of that file, or `undefined` when the folder is missing or holds no PDF.
 */
function findDefaultTestPdf(): string | undefined {
  const testDir = path.resolve(
    process.cwd(),
    '../docs/03-业务模块/ASL-AI智能文献/05-测试文档/PDF',
  );
  if (!fs.existsSync(testDir)) return undefined;

  const firstPdf = fs.readdirSync(testDir).find((entry) => entry.endsWith('.pdf'));
  return firstPdf === undefined ? undefined : path.join(testDir, firstPdf);
}
// ─── Entry point ──────────────────────────────────────────────
// Safety net for rejections that escape main(): log them and exit with code 1.
main().catch((err) => {
console.error('未捕获的错误:', err);
process.exit(1);
});