docs(asl): Complete Tool 3 extraction workbench V2.0 development plan (v1.5)
ASL Tool 3 Development Plan:
- Architecture blueprint v1.5 (6 rounds of architecture review, 13 red lines)
- M1/M2/M3 sprint checklists (Skeleton Pipeline / HITL Workbench / Dynamic Template Engine)
- Code patterns cookbook (9 chapters: Fan-out, Prompt engineering, ACL, SSE dual-track, etc.)
- Key patterns: Fan-out with Last Child Wins, Optimistic Locking, teamConcurrency throttling
- PKB ACL integration (anti-corruption layer), MinerU Cache-Aside, NOTIFY/LISTEN cross-pod SSE
- Data consistency snapshot for long-running extraction tasks

Platform capability:
- Add distributed Fan-out task pattern development guide (7 patterns + 10 anti-patterns)
- Add system-level async architecture risk analysis blueprint
- Add PDF table extraction engine design and usage guide (MinerU integration)
- Add table extraction source code (TableExtractionManager + MinerU engine)

Documentation updates:
- Update ASL module status with Tool 3 V2.0 plan readiness
- Update system status document (v6.2) with latest milestones
- Add V2.0 product requirements, prototypes, and data dictionary specs
- Add architecture review documents (4 rounds of review feedback)
- Add test PDF files for extraction validation

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
21
backend/package-lock.json
generated
21
backend/package-lock.json
generated
@@ -15,6 +15,7 @@
|
||||
"@prisma/client": "^6.17.0",
|
||||
"@types/form-data": "^2.2.1",
|
||||
"@wecom/crypto": "^1.0.1",
|
||||
"adm-zip": "^0.5.16",
|
||||
"ajv": "^8.17.1",
|
||||
"ali-oss": "^6.23.0",
|
||||
"axios": "^1.12.2",
|
||||
@@ -43,6 +44,7 @@
|
||||
"zod": "^4.1.12"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/adm-zip": "^0.5.7",
|
||||
"@types/ali-oss": "^6.23.1",
|
||||
"@types/bcryptjs": "^2.4.6",
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
@@ -1028,6 +1030,16 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/adm-zip": {
|
||||
"version": "0.5.7",
|
||||
"resolved": "https://registry.npmmirror.com/@types/adm-zip/-/adm-zip-0.5.7.tgz",
|
||||
"integrity": "sha512-DNEs/QvmyRLurdQPChqq0Md4zGvPwHerAJYWk9l2jCbD1VPpnzRJorOdiq4zsw09NFbYnhfsoEhWtxIzXpn2yw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/ali-oss": {
|
||||
"version": "6.23.1",
|
||||
"resolved": "https://registry.npmmirror.com/@types/ali-oss/-/ali-oss-6.23.1.tgz",
|
||||
@@ -1202,6 +1214,15 @@
|
||||
"node": ">=0.8"
|
||||
}
|
||||
},
|
||||
"node_modules/adm-zip": {
|
||||
"version": "0.5.16",
|
||||
"resolved": "https://registry.npmmirror.com/adm-zip/-/adm-zip-0.5.16.tgz",
|
||||
"integrity": "sha512-TGw5yVi4saajsSEgz25grObGHEUaDrniwvA2qwSC060KfqGPdglhvPMA2lPIoxs3PQIItj2iag35fONcQqgUaQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=12.0"
|
||||
}
|
||||
},
|
||||
"node_modules/agentkeepalive": {
|
||||
"version": "3.5.3",
|
||||
"resolved": "https://registry.npmmirror.com/agentkeepalive/-/agentkeepalive-3.5.3.tgz",
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
"@prisma/client": "^6.17.0",
|
||||
"@types/form-data": "^2.2.1",
|
||||
"@wecom/crypto": "^1.0.1",
|
||||
"adm-zip": "^0.5.16",
|
||||
"ajv": "^8.17.1",
|
||||
"ali-oss": "^6.23.0",
|
||||
"axios": "^1.12.2",
|
||||
@@ -60,6 +61,7 @@
|
||||
"zod": "^4.1.12"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/adm-zip": "^0.5.7",
|
||||
"@types/ali-oss": "^6.23.1",
|
||||
"@types/bcryptjs": "^2.4.6",
|
||||
"@types/js-yaml": "^4.0.9",
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
/**
|
||||
* PDF 表格提取引擎管理器
|
||||
*
|
||||
* 职责:引擎注册、默认引擎选择、统一调用入口。
|
||||
* 使用者通过此管理器调用表格提取,无需关心底层引擎。
|
||||
*/
|
||||
|
||||
import type {
|
||||
ITableExtractionEngine,
|
||||
ExtractionOptions,
|
||||
ExtractionResult,
|
||||
EngineType,
|
||||
} from './types.js';
|
||||
|
||||
/**
 * Registry and single entry point for PDF table-extraction engines.
 *
 * Engines are stored under their `name`. Callers extract tables through
 * `extractTables` and do not need to know which engine does the work.
 */
export class TableExtractionManager {
  private engines = new Map<string, ITableExtractionEngine>();
  private defaultEngineName: string;

  /**
   * @param defaultEngine Engine name used when a call does not name one.
   *   The engine does not have to be registered yet; lookups fail later if it
   *   is still missing.
   */
  constructor(defaultEngine: EngineType = 'mineru') {
    this.defaultEngineName = defaultEngine;
  }

  /** Registers an engine adapter under `engine.name`, replacing any previous entry with that name. */
  register(engine: ITableExtractionEngine): void {
    this.engines.set(engine.name, engine);
  }

  /**
   * Changes the default engine.
   *
   * @throws Error when no engine with that name has been registered.
   */
  setDefault(name: EngineType): void {
    if (!this.engines.has(name)) {
      throw new Error(this.notRegisteredMessage(name));
    }
    this.defaultEngineName = name;
  }

  /** Returns the names of all registered engines, in registration order. */
  availableEngines(): string[] {
    return Array.from(this.engines.keys());
  }

  /**
   * Looks up an engine instance.
   *
   * @param name Engine name; an omitted or empty value selects the default engine.
   * @throws Error when the selected engine is not registered.
   */
  getEngine(name?: string): ITableExtractionEngine {
    const key = name || this.defaultEngineName;
    const engine = this.engines.get(key);
    if (!engine) {
      throw new Error(this.notRegisteredMessage(key));
    }
    return engine;
  }

  /**
   * Extracts tables from a PDF. This is the only entry point callers need.
   *
   * @param pdf PDF file contents.
   * @param filename File name including its extension.
   * @param options Optional settings; `engine` overrides the default engine for this call.
   *   The whole object (including `engine`) is forwarded to the engine unchanged.
   * @throws Error when the selected engine is not registered, plus whatever the engine throws.
   */
  async extractTables(
    pdf: Buffer,
    filename: string,
    options?: ExtractionOptions & { engine?: EngineType },
  ): Promise<ExtractionResult> {
    const engine = this.getEngine(options?.engine);
    return engine.extractTables(pdf, filename, options);
  }

  /**
   * Builds the "not registered" error text shared by `setDefault` and `getEngine`,
   * so both report an empty registry as "none" instead of a blank list.
   */
  private notRegisteredMessage(name: string): string {
    const available = this.availableEngines().join(', ') || 'none';
    return `[TableExtractionManager] Engine "${name}" not registered. Available: ${available}`;
  }
}
|
||||
@@ -0,0 +1,249 @@
|
||||
/**
|
||||
* MinerU Cloud API 引擎适配器
|
||||
*
|
||||
* 完整流程:请求上传 URL → 上传 PDF → 轮询解析状态 → 下载 ZIP → 解析 HTML 表格
|
||||
*
|
||||
* API 文档:https://mineru.net/doc/docs/index_en/
|
||||
* 免费额度:2000 页/天 (vlm 模型)
|
||||
*/
|
||||
|
||||
import axios from 'axios';
|
||||
import AdmZip from 'adm-zip';
|
||||
import type {
|
||||
ITableExtractionEngine,
|
||||
ExtractionOptions,
|
||||
ExtractionResult,
|
||||
} from '../types.js';
|
||||
import { parseHtmlTablesFromMarkdown } from '../htmlTableParser.js';
|
||||
|
||||
// ─── 配置(延迟读取 process.env,兼容 dotenv 加载时序)───────
|
||||
|
||||
/**
 * Reads an environment variable at call time.
 *
 * @param key Name of the variable in `process.env`.
 * @param fallback Value returned when the variable is unset or empty.
 */
function getEnv(key: string, fallback: string): string {
  const value = process.env[key];
  return value ? value : fallback;
}
|
||||
|
||||
/** Pause before each status poll, in milliseconds. */
const POLL_INTERVAL_MS = 5 * 1_000;

/** Maximum number of status polls; at one poll per interval this is about 10 minutes. */
const POLL_MAX_ATTEMPTS = 120;

/** Timeout applied to the MinerU API requests made through axios, in milliseconds. */
const REQUEST_TIMEOUT_MS = 30 * 1_000;
|
||||
|
||||
// ─── MinerU API 响应类型 ──────────────────────────────────────
|
||||
|
||||
/** Response body of the upload-URL request (`POST {apiBase}/file-urls/batch`). */
interface BatchCreateResponse {
  /** Status code; this module treats any value other than 0 as a failure. */
  code: number;
  /** Message reported alongside `code`; used in error text. */
  msg: string;
  data: {
    /** Identifier used later to poll the batch result. */
    batch_id: string;
    /** Upload URLs; this module uploads to the first entry. */
    file_urls: string[];
  };
}
|
||||
|
||||
/** Processing state reported for one file of a batch. */
type MinerUExtractState = 'waiting' | 'processing' | 'done' | 'failed';

/** Result entry for one uploaded file. */
interface MinerUExtractItem {
  /** Caller-chosen identifier sent with the upload request; used to find this entry. */
  data_id: string;
  state: MinerUExtractState;
  /** URL of the result ZIP; read once `state` is 'done'. */
  full_zip_url?: string;
  /** Failure description; read when `state` is 'failed'. */
  err_msg?: string;
  page_count?: number;
}

/** Response body of the batch-result request (`GET {apiBase}/extract-results/batch/{batchId}`). */
interface BatchResultResponse {
  /** Status code; this module treats any value other than 0 as a failure. */
  code: number;
  /** Message reported alongside `code`; used in error text. */
  msg: string;
  data: {
    extract_result: Array<MinerUExtractItem>;
  };
}
|
||||
|
||||
// ─── 引擎实现 ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Table-extraction engine backed by the MinerU cloud API.
 *
 * One extraction requests an upload URL, PUTs the PDF to it, polls the batch
 * result until the file is done or failed, downloads the result ZIP and parses
 * the HTML tables found in the ZIP's Markdown files.
 */
export class MinerUEngine implements ITableExtractionEngine {
  readonly name = 'mineru';
  readonly displayName = 'MinerU Cloud API (VLM)';

  private apiBase: string;
  private token: string;
  private modelVersion: string;

  /**
   * @param options Explicit settings. Each missing or empty value falls back to
   *   the MINERU_API_BASE / MINERU_API_TOKEN / MINERU_MODEL_VERSION environment
   *   variable, then to a built-in default (there is no default token).
   * @throws Error when no API token is available from either source.
   */
  constructor(options?: {
    apiBase?: string;
    token?: string;
    modelVersion?: string;
  }) {
    this.apiBase = options?.apiBase || getEnv('MINERU_API_BASE', 'https://mineru.net/api/v4');
    this.token = options?.token || getEnv('MINERU_API_TOKEN', '');
    this.modelVersion = options?.modelVersion || getEnv('MINERU_MODEL_VERSION', 'vlm');

    if (!this.token) {
      throw new Error(
        '[MinerUEngine] MINERU_API_TOKEN is required. Set it in .env or pass via constructor.',
      );
    }
  }

  /**
   * Runs the full upload → poll → download → parse flow for one PDF.
   *
   * @param pdf PDF file contents.
   * @param filename File name sent to MinerU with the upload request.
   * @param options Only `keepRaw` is read here: when set, the combined Markdown
   *   is returned as `fullMarkdown`.
   * @returns Parsed tables plus engine name, elapsed milliseconds and the page
   *   count reported by MinerU (if any).
   * @throws Error when any step fails, the extraction is reported as failed,
   *   or polling runs out of attempts.
   */
  async extractTables(
    pdf: Buffer,
    filename: string,
    options?: ExtractionOptions,
  ): Promise<ExtractionResult> {
    const startTime = Date.now();
    // A timestamp alone collides when two extractions start within the same
    // millisecond; the random suffix keeps the data_id unique per call.
    const dataId = `extract_${Date.now()}_${Math.random().toString(36).slice(2, 10)}`;

    // Step 1: request a pre-signed upload URL.
    const { batchId, uploadUrl } = await this.requestUploadUrl(filename, dataId);

    // Step 2: upload the PDF.
    await this.uploadFile(uploadUrl, pdf);

    // Step 3: poll until parsing has finished.
    const result = await this.pollForResult(batchId, dataId);

    // Step 4: download the ZIP and read its Markdown.
    if (!result.full_zip_url) {
      throw new Error(`[MinerUEngine] No result URL. State: ${result.state}`);
    }
    const markdown = await this.downloadAndExtract(result.full_zip_url);

    // Step 5: parse the HTML tables embedded in the Markdown.
    const tables = parseHtmlTablesFromMarkdown(markdown);

    const duration = Date.now() - startTime;

    return {
      tables,
      engine: this.name,
      duration,
      pageCount: result.page_count,
      fullMarkdown: options?.keepRaw ? markdown : undefined,
    };
  }

  // ─── Private helpers ──────────────────────────────────────

  /** Headers sent with the JSON API requests (bearer token + JSON content type). */
  private get headers() {
    return {
      Authorization: `Bearer ${this.token}`,
      'Content-Type': 'application/json',
    };
  }

  /**
   * Asks MinerU for an upload URL for a single file.
   *
   * @throws Error when the HTTP request fails, the API reports a non-zero
   *   code, or the response carries no batch id / upload URL.
   */
  private async requestUploadUrl(
    filename: string,
    dataId: string,
  ): Promise<{ batchId: string; uploadUrl: string }> {
    const url = `${this.apiBase}/file-urls/batch`;
    const body = {
      files: [{ name: filename, data_id: dataId }],
      enable_table: true,
      model_version: this.modelVersion,
    };

    let resp;
    try {
      resp = await axios.post<BatchCreateResponse>(url, body, {
        headers: this.headers,
        timeout: REQUEST_TIMEOUT_MS,
      });
    } catch (err: unknown) {
      // Only axios errors carry an HTTP response; anything else is reported by message.
      const response = axios.isAxiosError(err) ? err.response : undefined;
      const status = response?.status;
      const detail = response?.data
        ? JSON.stringify(response.data).substring(0, 500)
        : err instanceof Error
          ? err.message
          : String(err);
      throw new Error(
        `[MinerUEngine] Upload URL request failed (HTTP ${status}): ${detail}`,
      );
    }

    if (resp.data.code !== 0) {
      throw new Error(
        `[MinerUEngine] Failed to get upload URL: ${resp.data.msg}`,
      );
    }

    // Guard against a success code with an empty payload; otherwise the upload
    // step would be called with an undefined URL.
    const batchId = resp.data.data?.batch_id;
    const uploadUrl = resp.data.data?.file_urls?.[0];
    if (!batchId || !uploadUrl) {
      throw new Error(
        '[MinerUEngine] Failed to get upload URL: response is missing batch_id or file_urls[0]',
      );
    }

    return { batchId, uploadUrl };
  }

  /**
   * PUTs the PDF bytes to the pre-signed URL.
   *
   * @throws Error when the response status is not OK.
   */
  private async uploadFile(
    uploadUrl: string,
    pdf: Buffer,
  ): Promise<void> {
    const resp = await fetch(uploadUrl, {
      method: 'PUT',
      body: pdf,
    });

    if (!resp.ok) {
      // The body is only used for diagnostics; ignore failures while reading it.
      const detail = await resp.text().catch(() => '');
      throw new Error(
        `[MinerUEngine] File upload failed (HTTP ${resp.status}): ${detail.substring(0, 500)}`,
      );
    }
  }

  /**
   * Polls the batch result until the entry for `dataId` is done.
   *
   * Waits POLL_INTERVAL_MS before every request (including the first) and
   * gives up after POLL_MAX_ATTEMPTS requests.
   *
   * @throws Error when the API reports a non-zero code, the entry is in the
   *   'failed' state, or the attempts are exhausted.
   */
  private async pollForResult(
    batchId: string,
    dataId: string,
  ): Promise<BatchResultResponse['data']['extract_result'][0]> {
    for (let attempt = 0; attempt < POLL_MAX_ATTEMPTS; attempt++) {
      await sleep(POLL_INTERVAL_MS);

      const resp = await axios.get<BatchResultResponse>(
        `${this.apiBase}/extract-results/batch/${batchId}`,
        { headers: this.headers, timeout: REQUEST_TIMEOUT_MS },
      );

      if (resp.data.code !== 0) {
        throw new Error(
          `[MinerUEngine] Poll error: ${resp.data.msg}`,
        );
      }

      // A response without our entry (or without a result list) is treated
      // as "not ready yet".
      const item = resp.data.data?.extract_result?.find(
        (r) => r.data_id === dataId,
      );

      if (!item) continue;

      if (item.state === 'done') return item;

      if (item.state === 'failed') {
        throw new Error(
          `[MinerUEngine] Extraction failed: ${item.err_msg || 'unknown error'}`,
        );
      }

      // 'waiting' / 'processing' → keep polling.
    }

    throw new Error(
      `[MinerUEngine] Polling timed out after ${(POLL_INTERVAL_MS * POLL_MAX_ATTEMPTS) / 1000}s`,
    );
  }

  /**
   * Downloads the result ZIP and returns the contents of all `.md` entries,
   * decoded as UTF-8 and joined with blank lines.
   *
   * @throws Error when the ZIP contains no `.md` entry.
   */
  private async downloadAndExtract(zipUrl: string): Promise<string> {
    const resp = await axios.get(zipUrl, {
      responseType: 'arraybuffer',
      timeout: 60_000,
    });

    const zip = new AdmZip(Buffer.from(resp.data));
    const mdEntries = zip
      .getEntries()
      .filter((e) => e.entryName.endsWith('.md'));

    if (mdEntries.length === 0) {
      throw new Error('[MinerUEngine] No .md file found in result ZIP');
    }

    return mdEntries.map((e) => e.getData().toString('utf-8')).join('\n\n');
  }
}
|
||||
|
||||
// ─── 工具 ─────────────────────────────────────────────────────
|
||||
|
||||
/** Returns a promise that resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
||||
150
backend/src/common/document/tableExtraction/htmlTableParser.ts
Normal file
150
backend/src/common/document/tableExtraction/htmlTableParser.ts
Normal file
@@ -0,0 +1,150 @@
|
||||
/**
|
||||
* HTML <table> 解析器
|
||||
*
|
||||
* 将 MinerU 等引擎输出的 HTML 表格转换为统一的 ExtractedTable 结构。
|
||||
* 纯正则 + 字符串处理,无额外依赖(Node.js 原生即可)。
|
||||
*/
|
||||
|
||||
import type { ExtractedTable, MergedCell } from './types.js';
|
||||
|
||||
// ─── 公共 API ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Finds every HTML `<table>…</table>` in a Markdown string and parses each
 * one into an ExtractedTable, in document order.
 *
 * The title of each table is looked up in the text just before it.
 */
export function parseHtmlTablesFromMarkdown(markdown: string): ExtractedTable[] {
  const tablePattern = /<table[\s\S]*?<\/table>/gi;
  return Array.from(markdown.matchAll(tablePattern), (hit) => {
    const title = extractTableTitle(markdown, hit.index ?? 0);
    return parseOneHtmlTable(hit[0], title);
  });
}
|
||||
|
||||
/**
 * Parses a single HTML `<table>` string.
 *
 * The first `<tr>` becomes `headers`, the remaining ones become `rows`.
 * Shorter rows are padded with empty strings to the widest row.
 * Cells with a rowspan or colspan above 1 are listed in `mergedCells`; `row`
 * counts from the header row and `col` is the cell's position within its
 * `<tr>`. Spans are recorded only, not expanded into the grid.
 *
 * @param html Table markup; stored unchanged as `rawHtml`.
 * @param title Title stored on the result (defaults to an empty string).
 */
export function parseOneHtmlTable(html: string, title = ''): ExtractedTable {
  const mergedCells: MergedCell[] = [];

  const grid = extractRows(html).map((rowHtml, row) =>
    extractCells(rowHtml).map(({ text, rowSpan, colSpan }, col) => {
      if (rowSpan > 1 || colSpan > 1) {
        mergedCells.push({ row, col, rowSpan, colSpan });
      }
      return text;
    }),
  );

  // Pad every row to the width of the widest one.
  const width = grid.reduce((widest, cells) => Math.max(widest, cells.length), 0);
  const padded = grid.map((cells) =>
    cells.concat(Array<string>(width - cells.length).fill('')),
  );

  const [headers = [], ...rows] = padded;

  return {
    title,
    headers,
    rows,
    mergedCells,
    footnotes: [],
    rawHtml: html,
  };
}
|
||||
|
||||
// ─── 内部工具 ─────────────────────────────────────────────────
|
||||
|
||||
/** One parsed `<td>` / `<th>` cell. */
type CellInfo = {
  /** Cell content with tags removed and surrounding whitespace trimmed. */
  text: string;
  /** Value of the `rowspan` attribute, or 1 when absent. */
  rowSpan: number;
  /** Value of the `colspan` attribute, or 1 when absent. */
  colSpan: number;
};
|
||||
|
||||
/** Returns the markup of every `<tr>…</tr>` element in a table, in order. */
function extractRows(tableHtml: string): string[] {
  const found = tableHtml.match(/<tr[\s>][\s\S]*?<\/tr>/gi);
  return found === null ? [] : [...found];
}
|
||||
|
||||
/**
 * Returns the `<td>` / `<th>` cells of one table row, in order, with their
 * plain text and span attributes.
 */
function extractCells(trHtml: string): CellInfo[] {
  const cellPattern = /<(td|th)([\s\S]*?)>([\s\S]*?)<\/\1>/gi;
  return Array.from(trHtml.matchAll(cellPattern), (hit) => {
    const attrs = hit[2];
    const inner = hit[3];
    return {
      text: stripHtml(inner).trim(),
      rowSpan: parseIntAttr(attrs, 'rowspan'),
      colSpan: parseIntAttr(attrs, 'colspan'),
    };
  });
}
|
||||
|
||||
/**
 * Reads an integer attribute (optionally quoted) from a tag's attribute text.
 *
 * @param attrs Raw attribute text of the tag.
 * @param name Attribute name, matched case-insensitively.
 * @returns The parsed value, or 1 when the attribute is not present.
 */
function parseIntAttr(attrs: string, name: string): number {
  const found = attrs.match(new RegExp(`${name}\\s*=\\s*["']?(\\d+)["']?`, 'i'));
  if (found === null) {
    return 1;
  }
  return Number.parseInt(found[1], 10);
}
|
||||
|
||||
/**
 * Converts an HTML fragment to plain text.
 *
 * `<br>` becomes a space, all other tags are removed, the entities `&amp;`,
 * `&lt;`, `&gt;`, `&quot;`, `&nbsp;` and numeric references (`&#65;`, `&#x41;`)
 * are decoded, and runs of whitespace collapse to a single space. The result
 * is not trimmed.
 */
function stripHtml(html: string): string {
  const named: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    nbsp: ' ',
  };

  return html
    .replace(/<br\s*\/?>/gi, ' ')
    // Tags go first so that an escaped "&lt;b&gt;" survives as literal text.
    .replace(/<[^>]+>/g, '')
    // Decode all entities in ONE pass: decoding "&amp;" in a separate earlier
    // step would turn "&amp;lt;" into "<" instead of "&lt;".
    .replace(
      /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|nbsp));/g,
      (whole: string, dec?: string, hex?: string, name?: string) => {
        if (name !== undefined) {
          return named[name];
        }
        const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
        // fromCodePoint throws on values outside Unicode; leave those untouched.
        if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
          return whole;
        }
        return String.fromCodePoint(codePoint);
      },
    )
    .replace(/\s+/g, ' ');
}
|
||||
|
||||
/**
 * Searches backwards from a table's position for its caption.
 *
 * Looks at the last five non-blank lines within the 500 characters before
 * `tableOffset`, nearest line first. After removing `**` markers and leading
 * `#` heading marks, a line qualifies when it starts with "Table <digit>"
 * (case-insensitive) or "表<digit>".
 *
 * @returns The cleaned caption line, or an empty string when none qualifies.
 */
function extractTableTitle(markdown: string, tableOffset: number): string {
  const windowStart = Math.max(0, tableOffset - 500);
  const nearestFirst = markdown
    .substring(windowStart, tableOffset)
    .split('\n')
    .filter((candidate) => candidate.trim().length > 0)
    .slice(-5)
    .reverse();

  for (const candidate of nearestFirst) {
    const cleaned = candidate.trim().replace(/\*\*/g, '').replace(/^#+\s*/, '');
    const looksLikeCaption = /^table\s+\d/i.test(cleaned) || /^表\s*\d/i.test(cleaned);
    if (looksLikeCaption) {
      return cleaned;
    }
  }

  return '';
}
|
||||
47
backend/src/common/document/tableExtraction/index.ts
Normal file
47
backend/src/common/document/tableExtraction/index.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
/**
|
||||
* PDF 表格提取引擎 — 统一导出
|
||||
*
|
||||
* 使用方式:
|
||||
* import { tableExtraction } from '@/common/document/tableExtraction';
|
||||
* const result = await tableExtraction.extractTables(pdfBuffer, 'paper.pdf');
|
||||
* for (const table of result.tables) { ... }
|
||||
*/
|
||||
|
||||
// ─── 类型导出 ─────────────────────────────────────────────────
|
||||
|
||||
export type {
|
||||
ITableExtractionEngine,
|
||||
ExtractionOptions,
|
||||
ExtractionResult,
|
||||
ExtractedTable,
|
||||
MergedCell,
|
||||
EngineType,
|
||||
} from './types.js';
|
||||
|
||||
// ─── 类导出 ───────────────────────────────────────────────────
|
||||
|
||||
export { TableExtractionManager } from './TableExtractionManager.js';
|
||||
export { MinerUEngine } from './engines/MinerUEngine.js';
|
||||
export { parseHtmlTablesFromMarkdown, parseOneHtmlTable } from './htmlTableParser.js';
|
||||
|
||||
// ─── 全局单例 ─────────────────────────────────────────────────
|
||||
|
||||
import { TableExtractionManager } from './TableExtractionManager.js';
|
||||
import { MinerUEngine } from './engines/MinerUEngine.js';
|
||||
|
||||
/** Process-wide manager; stays `null` until the first call to getTableExtractionManager. */
let _instance: TableExtractionManager | null = null;

/**
 * Returns the shared TableExtractionManager, creating it on first use with
 * 'mineru' as the default engine.
 *
 * The MinerU engine is registered as soon as MINERU_API_TOKEN is set and no
 * engine named 'mineru' is registered yet. The check runs on every call, so a
 * token that becomes available after the first call (for example because the
 * environment is loaded later) is still picked up instead of leaving the
 * singleton without an engine for the rest of the process.
 */
export function getTableExtractionManager(): TableExtractionManager {
  if (!_instance) {
    _instance = new TableExtractionManager('mineru');
  }

  // Skip when an engine named 'mineru' already exists so a manually
  // registered replacement is never overwritten.
  if (process.env.MINERU_API_TOKEN && !_instance.availableEngines().includes('mineru')) {
    _instance.register(new MinerUEngine());
  }

  return _instance;
}
|
||||
89
backend/src/common/document/tableExtraction/types.ts
Normal file
89
backend/src/common/document/tableExtraction/types.ts
Normal file
@@ -0,0 +1,89 @@
|
||||
/**
|
||||
* PDF 表格提取引擎 — 统一类型定义
|
||||
*
|
||||
* 核心原则:使用者只需提交 PDF,获取 ExtractedTable[],无需关心底层引擎实现。
|
||||
* 所有引擎适配器必须实现 ITableExtractionEngine 接口。
|
||||
*/
|
||||
|
||||
// ─── 引擎接口 ───────────────────────────────────────────────
|
||||
|
||||
/** Contract every table-extraction engine adapter implements. */
export interface ITableExtractionEngine {
  /** Unique engine identifier (e.g. 'mineru', 'qwen3vl', 'paddle'). */
  readonly name: string;

  /** Human-readable engine name. */
  readonly displayName: string;

  /**
   * Extracts tables from a PDF.
   *
   * @param pdf PDF file contents.
   * @param filename Name of the PDF file.
   * @param options Optional extraction settings.
   */
  extractTables(
    pdf: Buffer,
    filename: string,
    options?: ExtractionOptions,
  ): Promise<ExtractionResult>;
}
|
||||
|
||||
// ─── 选项与结果 ──────────────────────────────────────────────
|
||||
|
||||
/** Optional settings for one extraction call. */
export interface ExtractionOptions {
  /** Language hint; some engines may use it to improve recognition. */
  language?: 'zh' | 'en' | 'auto';

  /** Pages to process, e.g. [1, 2, 5]. */
  pages?: number[];

  /** Whether to keep the engine's raw HTML / Markdown output in the result. */
  keepRaw?: boolean;
}
|
||||
|
||||
/** Outcome of one extraction call. */
export interface ExtractionResult {
  /** Tables that were extracted. */
  tables: ExtractedTable[];

  /** Name of the engine that produced the result. */
  engine: string;

  /** Processing time in milliseconds. */
  duration: number;

  /** Total number of PDF pages (only some engines report it). */
  pageCount?: number;

  /** Complete Markdown returned by the engine (optional). */
  fullMarkdown?: string;
}
|
||||
|
||||
// ─── 表格数据结构 ─────────────────────────────────────────────
|
||||
|
||||
/** Engine-independent representation of one extracted table. */
export interface ExtractedTable {
  /** Table title (e.g. "Table 1. Baseline characteristics"). */
  title: string;

  /** Header cells. */
  headers: string[];

  /** Data rows (two-dimensional; each row has the same length as `headers`). */
  rows: string[][];

  /** Merged-cell information. */
  mergedCells: MergedCell[];

  /** Table footnotes. */
  footnotes: string[];

  /** PDF page the table is on (1-based). */
  pageNumber?: number;

  /** Original HTML fragment (engines such as MinerU output HTML directly). */
  rawHtml?: string;

  /** Original Markdown fragment. */
  rawMarkdown?: string;
}
|
||||
|
||||
/** Position and extent of one merged cell. */
export interface MergedCell {
  /** Starting row (0-based). */
  row: number;

  /** Starting column (0-based). */
  col: number;

  /** Number of rows spanned. */
  rowSpan: number;

  /** Number of columns spanned. */
  colSpan: number;
}
|
||||
|
||||
// ─── 引擎类型 ─────────────────────────────────────────────────
|
||||
|
||||
/** Engine identifiers accepted wherever an engine is selected by name. */
export type EngineType = 'mineru' | 'qwen3vl' | 'paddle' | 'qwenocr' | 'docling' | 'deepseek';
|
||||
149
backend/src/tests/test-table-extraction.ts
Normal file
149
backend/src/tests/test-table-extraction.ts
Normal file
@@ -0,0 +1,149 @@
|
||||
/**
|
||||
* PDF 表格提取引擎集成测试
|
||||
*
|
||||
* 使用 MinerU Cloud API 从真实医学 PDF 中提取表格,验证引擎完整流程。
|
||||
*
|
||||
* 用法:
|
||||
* npx tsx src/tests/test-table-extraction.ts <pdf文件路径>
|
||||
*
|
||||
* 示例:
|
||||
* npx tsx src/tests/test-table-extraction.ts "../docs/03-业务模块/ASL-AI智能文献/05-测试文档/PDF/Herrschaft 2012.pdf"
|
||||
*/
|
||||
|
||||
import { config } from 'dotenv';
|
||||
config();
|
||||
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import {
|
||||
getTableExtractionManager,
|
||||
type ExtractedTable,
|
||||
} from '../common/document/tableExtraction/index.js';
|
||||
|
||||
// ─── 主流程 ───────────────────────────────────────────────────
|
||||
|
||||
/**
 * Runs the integration check end to end: picks a PDF (first CLI argument, or
 * the first PDF in the default test directory), verifies MINERU_API_TOKEN is
 * set, extracts tables through the shared manager, prints a summary and
 * writes the full Markdown to `test-output/`.
 *
 * Exits the process with code 1 when the file is missing, the token is not
 * configured, or the extraction throws.
 */
async function main() {
  const banner = '========================================';
  console.log(banner);
  console.log(' PDF 表格提取引擎 — 集成测试');
  console.log(`${banner}\n`);

  // 1. Decide which file to test.
  const pdfPath = process.argv[2] || findDefaultTestPdf();
  const fileFound = Boolean(pdfPath) && fs.existsSync(pdfPath as string);
  if (!pdfPath || !fileFound) {
    console.error(`文件不存在: ${pdfPath || '(未指定)'}`);
    console.log('\n用法: npx tsx src/tests/test-table-extraction.ts <pdf路径>');
    process.exit(1);
  }

  const filename = path.basename(pdfPath);
  const sizeInBytes = fs.statSync(pdfPath).size;
  console.log(`文件: ${filename}`);
  console.log(`大小: ${(sizeInBytes / 1024).toFixed(1)} KB`);
  console.log(`路径: ${path.resolve(pdfPath)}\n`);

  // 2. Check the environment.
  if (!process.env.MINERU_API_TOKEN) {
    console.error('MINERU_API_TOKEN 未设置,请检查 backend/.env');
    process.exit(1);
  }
  const apiBase = process.env.MINERU_API_BASE || 'https://mineru.net/api/v4';
  const modelVersion = process.env.MINERU_MODEL_VERSION || 'vlm';
  console.log('MINERU_API_TOKEN: ...已配置');
  console.log(`MINERU_API_BASE: ${apiBase}`);
  console.log(`MINERU_MODEL_VERSION: ${modelVersion}\n`);

  // 3. Obtain the engine manager.
  const manager = getTableExtractionManager();
  console.log(`已注册引擎: [${manager.availableEngines().join(', ')}]`);
  console.log('默认引擎: mineru\n');

  // 4. Run the extraction.
  console.log('--- 开始提取 ---\n');
  const pdfBuffer = fs.readFileSync(pdfPath);

  try {
    const result = await manager.extractTables(pdfBuffer, filename, {
      keepRaw: true,
    });

    // 5. Report the result.
    console.log('\n--- 提取完成 ---\n');
    console.log(`引擎: ${result.engine}`);
    console.log(`耗时: ${(result.duration / 1000).toFixed(1)}s`);
    console.log(`PDF 页数: ${result.pageCount ?? '未知'}`);
    console.log(`检出表格: ${result.tables.length} 个\n`);

    if (result.tables.length === 0) {
      console.log('未检出任何表格。');
    }

    result.tables.forEach((table, position) => {
      printTable(position, table);
    });

    // 6. Save the complete Markdown when the engine returned it.
    if (result.fullMarkdown) {
      const outDir = path.resolve(process.cwd(), 'test-output');
      if (!fs.existsSync(outDir)) {
        fs.mkdirSync(outDir, { recursive: true });
      }
      const mdPath = path.join(outDir, `${filename}.md`);
      fs.writeFileSync(mdPath, result.fullMarkdown, 'utf-8');
      console.log(`\n完整 Markdown 已保存: ${mdPath}`);
    }

    console.log('\n测试通过');
  } catch (err: any) {
    console.error('\n提取失败:', err.message);
    process.exit(1);
  }
}
|
||||
|
||||
// ─── 工具函数 ─────────────────────────────────────────────────
|
||||
|
||||
/**
 * Prints a short summary of one table: title, column / row / merged-cell
 * counts, page number when present, the header row and up to three data rows.
 *
 * @param index Zero-based position of the table; shown one-based.
 * @param table Table to summarise.
 */
function printTable(index: number, table: ExtractedTable) {
  const previewLimit = 3;
  const formatCells = (cells: string[]): string =>
    cells.map((cell) => truncate(cell, 20)).join(' | ');

  console.log(`────────────────────────────────────────`);
  console.log(`表格 ${index + 1}: ${table.title || '(无标题)'}`);
  console.log(` 列数: ${table.headers.length}`);
  console.log(` 行数: ${table.rows.length}`);
  console.log(` 合并单元格: ${table.mergedCells.length}`);
  if (table.pageNumber) {
    console.log(` 页码: ${table.pageNumber}`);
  }

  // Header row.
  if (table.headers.length > 0) {
    console.log(` 表头: ${formatCells(table.headers)}`);
  }

  // First few data rows.
  table.rows.slice(0, previewLimit).forEach((row) => {
    console.log(` ${formatCells(row)}`);
  });

  const hiddenRows = table.rows.length - previewLimit;
  if (hiddenRows > 0) {
    console.log(` ... 还有 ${hiddenRows} 行`);
  }
  console.log('');
}
|
||||
|
||||
/**
 * Fits a string into a fixed-width column.
 *
 * Strings no longer than `maxLen` are right-padded with spaces to `maxLen`;
 * longer ones are cut to `maxLen - 2` characters followed by "..".
 */
function truncate(s: string, maxLen: number): string {
  return s.length > maxLen ? `${s.substring(0, maxLen - 2)}..` : s.padEnd(maxLen);
}
|
||||
|
||||
/**
 * Locates a fallback test PDF in the docs test directory, resolved relative
 * to the current working directory.
 *
 * @returns Path of the first directory entry ending in `.pdf`, or `undefined`
 *   when the directory does not exist or holds no such entry.
 */
function findDefaultTestPdf(): string | undefined {
  const testDir = path.resolve(
    process.cwd(),
    '../docs/03-业务模块/ASL-AI智能文献/05-测试文档/PDF',
  );
  if (!fs.existsSync(testDir)) {
    return undefined;
  }

  const firstPdf = fs.readdirSync(testDir).find((entry) => entry.endsWith('.pdf'));
  return firstPdf === undefined ? undefined : path.join(testDir, firstPdf);
}
|
||||
|
||||
// ─── 执行 ─────────────────────────────────────────────────────
|
||||
|
||||
// Start the script; any rejection that escapes main() is logged and turned
// into a non-zero exit code.
main().catch((error) => {
  console.error('未捕获的错误:', error);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user