Files
AIclinicalresearch/extraction_service/services/pdf_processor.py
HaHafeng 40c2f8e148 feat(rag): Complete RAG engine implementation with pgvector
Major Features:
- Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk
- Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors)
- Implemented ChunkService (smart Markdown chunking)
- Implemented VectorSearchService (multi-query + hybrid search)
- Implemented RerankService (qwen3-rerank)
- Integrated DeepSeek V3 QueryRewriter for cross-language search
- Python service: Added pymupdf4llm for PDF-to-Markdown conversion
- PKB: Dual-mode adapter (pgvector/dify/hybrid)

Architecture:
- Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector)
- Cross-language support: Chinese query matches English documents
- Small Embedding (1024) + Strong Reranker strategy

Performance:
- End-to-end latency: 2.5s
- Cost per query: 0.0025 RMB
- Accuracy improvement: +20.5% (cross-language)

Tests:
- test-embedding-service.ts: Vector embedding verified
- test-rag-e2e.ts: Full pipeline tested
- test-rerank.ts: Rerank quality validated
- test-query-rewrite.ts: Cross-language search verified
- test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf)

Documentation:
- Added 05-RAG-Engine-User-Guide.md
- Added 02-Document-Processing-User-Guide.md
- Updated system status documentation

Status: Production ready
2026-01-21 20:24:29 +08:00

115 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PDF处理主服务
策略:
- 所有 PDF 统一使用 PyMuPDF 处理(快速、稳定)
- RAG 引擎推荐使用 pymupdf4llm(见 pdf_markdown_processor.py)
注意:Nougat 已废弃,不再使用
"""
from typing import Dict, Any, Optional
from loguru import logger
from .language_detector import detect_language
from .pdf_extractor import extract_pdf_pymupdf
def extract_pdf(
    file_path: str,
    force_method: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract text from a PDF file using PyMuPDF.

    Processing flow:
        1. Detect the document language (recorded as metadata only).
        2. Extract the text with PyMuPDF.

    Note: for the RAG engine, the /api/document/to-markdown endpoint is
    recommended instead; it uses pymupdf4llm, which provides better table
    and structure support.

    Args:
        file_path: Path to the PDF file.
        force_method: Deprecated; kept only for backward compatibility.
            PyMuPDF is always used. Any value other than None or
            'pymupdf' is ignored and a warning is logged.

    Returns:
        The dict produced by extract_pdf_pymupdf. When that dict reports
        success, 'reason' and 'detected_language' are added to it, e.g.:

        {
            "success": True,
            "method": "pymupdf",
            "reason": "...",
            "text": "extracted text",
            "metadata": {...}
        }

        If any step raises, the exception is logged and a failure dict is
        returned instead:

        {"success": False, "error": "<message>", "method": "pymupdf"}
    """
    try:
        logger.info(f"开始处理PDF: {file_path}")

        # The parameter no longer selects a backend. Tell the caller so
        # instead of silently ignoring a request for a removed method.
        if force_method is not None and force_method != 'pymupdf':
            logger.warning(
                f"force_method={force_method!r} 已废弃,统一使用 PyMuPDF 处理"
            )

        # Step 1: language detection (used for metadata only).
        logger.info("[Step 1] 检测PDF语言...")
        language = detect_language(file_path)
        logger.info(f"检测结果: {language}")

        # Step 2: extract with PyMuPDF.
        logger.info("[Step 2] 使用PyMuPDF处理")
        result = extract_pdf_pymupdf(file_path)

        if result['success']:
            # Annotate only successful results; a failed result is
            # returned exactly as the extractor produced it.
            result['reason'] = 'pymupdf_standard'
            result['detected_language'] = language
            logger.info("✅ PyMuPDF处理成功")
        else:
            logger.error("❌ PyMuPDF处理失败")

        return result

    except Exception as e:
        # Boundary handler: callers expect a result dict, never an
        # exception. logger.exception keeps the traceback in the log.
        logger.exception(f"PDF处理失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf"
        }
def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
    """
    Report which processing strategy would be used, without extracting.

    Intended as a preview of the method that extraction will use.

    Args:
        file_path: Path to the PDF file.

    Returns:
        On success:

        {
            "detected_language": "chinese" | "english",
            "recommended_method": "pymupdf",
            "reason": "...",
            "nougat_available": False  # deprecated
        }

        If language detection raises, the error is logged and
        {"error": "<message>"} is returned instead.
    """
    try:
        # Language detection is the only step that can fail here.
        detected = detect_language(file_path)
    except Exception as exc:
        logger.error(f"获取处理策略失败: {exc!s}")
        return {"error": str(exc)}
    else:
        strategy: Dict[str, Any] = {
            "detected_language": detected,
            "recommended_method": "pymupdf",
            "reason": "统一使用 PyMuPDF 处理RAG 引擎推荐使用 /api/document/to-markdown",
            # Nougat is deprecated, so it is always reported unavailable.
            "nougat_available": False,
        }
        return strategy