Files
AIclinicalresearch/extraction_service/services/pdf_markdown_processor.py
HaHafeng 40c2f8e148 feat(rag): Complete RAG engine implementation with pgvector
Major Features:
- Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk
- Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors)
- Implemented ChunkService (smart Markdown chunking)
- Implemented VectorSearchService (multi-query + hybrid search)
- Implemented RerankService (qwen3-rerank)
- Integrated DeepSeek V3 QueryRewriter for cross-language search
- Python service: Added pymupdf4llm for PDF-to-Markdown conversion
- PKB: Dual-mode adapter (pgvector/dify/hybrid)

Architecture:
- Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector)
- Cross-language support: Chinese query matches English documents
- Small Embedding (1024) + Strong Reranker strategy

Performance:
- End-to-end latency: 2.5s
- Cost per query: 0.0025 RMB
- Accuracy improvement: +20.5% (cross-language)

Tests:
- test-embedding-service.ts: Vector embedding verified
- test-rag-e2e.ts: Full pipeline tested
- test-rerank.ts: Rerank quality validated
- test-query-rewrite.ts: Cross-language search verified
- test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf)

Documentation:
- Added 05-RAG-Engine-User-Guide.md
- Added 02-Document-Processing-User-Guide.md
- Updated system status documentation

Status: Production ready
2026-01-21 20:24:29 +08:00

147 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PDF Markdown 处理器 - 基于 pymupdf4llm
特点:
- 输出 LLM 友好的 Markdown 格式
- 完整保留表格结构
- 自动检测扫描件并返回友好提示
- 零 OCR:只处理电子版 PDF
"""
import pymupdf4llm
from pathlib import Path
from typing import Dict, Any, Optional, List
from loguru import logger
class PdfMarkdownProcessor:
    """Convert text-based PDFs to Markdown using pymupdf4llm.

    No OCR is performed. When the amount of extracted text falls below
    ``MIN_TEXT_THRESHOLD`` the document is treated as a scanned (image-only)
    PDF and a user-facing hint is returned in place of the content.
    """

    # Scan-detection threshold: a PDF yielding fewer extracted characters than
    # this is treated as a scanned document.
    MIN_TEXT_THRESHOLD = 50

    def __init__(self, image_dir: str = "./images"):
        """Store the directory passed to pymupdf4llm when images are extracted."""
        self.image_dir = image_dir

    def to_markdown(
        self,
        pdf_path: str,
        page_chunks: bool = False,
        extract_images: bool = False,
        dpi: int = 150
    ) -> Dict[str, Any]:
        """Convert a PDF to Markdown (text-based PDFs only).

        Args:
            pdf_path: Path of the PDF file.
            page_chunks: Ask pymupdf4llm for per-page chunks. The chunks are
                merged back into one string, each under a ``## Page N`` header.
            extract_images: Write embedded images to ``self.image_dir``
                (off by default to save space).
            dpi: Image resolution forwarded to pymupdf4llm.

        Returns:
            On success::

                {
                    "success": True,
                    "markdown": "<Markdown text, or a hint for scanned PDFs>",
                    "metadata": {"page_count": 10, "char_count": 5000,
                                 "is_scanned": False},
                    "is_scanned": False
                }

            On failure ``success`` is ``False`` and the dict carries ``error``
            (the exception text) and ``markdown`` (a user-facing notice)
            instead of ``metadata`` / ``is_scanned``.
        """
        filename = Path(pdf_path).name
        try:
            logger.info(f"开始使用 pymupdf4llm 处理: {filename}")
            md_text = pymupdf4llm.to_markdown(
                pdf_path,
                page_chunks=page_chunks,
                write_images=extract_images,
                image_path=self.image_dir if extract_images else None,
                dpi=dpi,
                show_progress=False
            )

            if isinstance(md_text, list):
                # page_chunks=True yields a list of per-page dicts. Count only
                # the extracted text: the injected headers and separators
                # would otherwise push an empty multi-page scan over the
                # threshold and defeat scan detection.
                md_text, char_count = self._merge_page_chunks(md_text)
            else:
                char_count = len(md_text.strip())

            is_scanned = char_count < self.MIN_TEXT_THRESHOLD
            page_count = self._get_page_count(pdf_path)

            if is_scanned:
                logger.warning(f"PDF 文本过少 ({char_count} 字符),可能为扫描件: {filename}")
                markdown = self._scan_pdf_hint(filename, char_count)
            else:
                logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")
                markdown = md_text

            return {
                "success": True,
                "markdown": markdown,
                "metadata": {
                    "page_count": page_count,
                    "char_count": char_count,
                    "is_scanned": is_scanned
                },
                "is_scanned": is_scanned
            }
        except Exception as e:
            # Boundary handler: callers receive a failure dict, never an
            # exception raised by the conversion itself.
            logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
            return {
                "success": False,
                "error": str(e),
                "markdown": f"> **系统提示**:文档 `{filename}` 解析失败: {str(e)}"
            }

    @staticmethod
    def _merge_page_chunks(pages: List[Dict[str, Any]]) -> tuple:
        """Merge per-page chunk dicts into a single Markdown string.

        Args:
            pages: Page dicts; the ``text`` entry of each one is used and a
                missing or ``None`` entry counts as empty text.

        Returns:
            ``(markdown, char_count)``. ``markdown`` is the pages joined under
            ``## Page N`` headers with ``---`` separators. ``char_count`` is
            the summed length of the stripped page texts, excluding the
            headers and separators added here.
        """
        texts = [page.get("text") or "" for page in pages]
        markdown = "\n\n---\n\n".join(
            f"## Page {number}\n\n{text}"
            for number, text in enumerate(texts, start=1)
        )
        char_count = sum(len(text.strip()) for text in texts)
        return markdown, char_count

    def _get_page_count(self, pdf_path: str) -> int:
        """Return the number of pages in the PDF, or 0 if it cannot be read."""
        try:
            import fitz  # pymupdf

            # The context manager closes the document even if counting fails.
            with fitz.open(pdf_path) as doc:
                return len(doc)
        except Exception:
            # Best effort: the page count is metadata only and must not make
            # the conversion fail.
            return 0

    def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
        """Build the Markdown block-quote hint shown for scanned PDFs."""
        # NOTE(review): the first line opens a parenthesis that is never
        # closed in the text as received - confirm the intended wording.
        hint_lines = (
            f"> **系统提示**:文档 `{filename}` 似乎是扫描件(图片型 PDF",
            ">",
            f"> - 提取文本量:{char_count} 字符",
            "> - 本系统暂不支持扫描版 PDF 的文字识别",
            "> - 建议:请上传电子版 PDF或将扫描件转换为可编辑格式后重新上传",
        )
        return "\n".join(hint_lines)
# 便捷函数
def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
    """Convert a PDF to Markdown with a default-configured processor.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The result dict produced by ``PdfMarkdownProcessor.to_markdown``.
    """
    return PdfMarkdownProcessor().to_markdown(pdf_path)