feat(rag): Complete RAG engine implementation with pgvector
Major Features: - Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk - Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors) - Implemented ChunkService (smart Markdown chunking) - Implemented VectorSearchService (multi-query + hybrid search) - Implemented RerankService (qwen3-rerank) - Integrated DeepSeek V3 QueryRewriter for cross-language search - Python service: Added pymupdf4llm for PDF-to-Markdown conversion - PKB: Dual-mode adapter (pgvector/dify/hybrid) Architecture: - Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector) - Cross-language support: Chinese query matches English documents - Small Embedding (1024) + Strong Reranker strategy Performance: - End-to-end latency: 2.5s - Cost per query: 0.0025 RMB - Accuracy improvement: +20.5% (cross-language) Tests: - test-embedding-service.ts: Vector embedding verified - test-rag-e2e.ts: Full pipeline tested - test-rerank.ts: Rerank quality validated - test-query-rewrite.ts: Cross-language search verified - test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf) Documentation: - Added 05-RAG-Engine-User-Guide.md - Added 02-Document-Processing-User-Guide.md - Updated system status documentation Status: Production ready
This commit is contained in:
@@ -79,6 +79,9 @@ models/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -56,11 +56,17 @@ TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
from services.pdf_extractor import extract_pdf_pymupdf
|
||||
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
|
||||
from services.language_detector import detect_language, detect_language_detailed
|
||||
from services.nougat_extractor import check_nougat_available, get_nougat_info
|
||||
from services.file_utils import detect_file_type, cleanup_temp_file
|
||||
from services.docx_extractor import extract_docx_mammoth, validate_docx_file
|
||||
from services.txt_extractor import extract_txt, validate_txt_file
|
||||
from services.dc_executor import validate_code, execute_pandas_code
|
||||
# 新增:统一文档处理器(RAG 引擎使用)
|
||||
from services.document_processor import DocumentProcessor, convert_to_markdown
|
||||
from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to_markdown
|
||||
|
||||
# 兼容:nougat 相关(已废弃,保留空实现避免报错)
|
||||
def check_nougat_available(): return False
|
||||
def get_nougat_info(): return {"available": False, "reason": "已废弃,使用 pymupdf4llm 替代"}
|
||||
|
||||
# ✨ 导入预写的数据操作函数
|
||||
from operations.filter import apply_filter
|
||||
@@ -661,6 +667,72 @@ async def extract_document(
|
||||
)
|
||||
|
||||
|
||||
# ==================== RAG 引擎 - 文档转 Markdown 接口 ====================
|
||||
|
||||
@app.post("/api/document/to-markdown")
|
||||
async def document_to_markdown(
|
||||
file: UploadFile = File(...),
|
||||
file_type: Optional[str] = None
|
||||
):
|
||||
"""
|
||||
RAG 引擎 - 文档转 Markdown 接口
|
||||
|
||||
将各种格式的文档(PDF、Word、TXT 等)转换为 LLM 友好的 Markdown 格式。
|
||||
这是知识库引擎的核心文档处理接口。
|
||||
|
||||
Args:
|
||||
file: 上传的文件
|
||||
file_type: 可选,指定文件类型 ('pdf' | 'docx' | 'txt' | 'md')
|
||||
|
||||
Returns:
|
||||
{
|
||||
"success": true,
|
||||
"text": "# 文档标题\\n\\n文档内容...",
|
||||
"format": "markdown",
|
||||
"metadata": {
|
||||
"original_file_type": "pdf",
|
||||
"char_count": 12345,
|
||||
"filename": "example.pdf"
|
||||
}
|
||||
}
|
||||
|
||||
Raises:
|
||||
400: 不支持的文件格式
|
||||
500: 处理失败
|
||||
"""
|
||||
temp_path = None
|
||||
try:
|
||||
# 保存上传的文件到临时目录
|
||||
temp_path = TEMP_DIR / file.filename
|
||||
with open(temp_path, "wb") as f:
|
||||
content = await file.read()
|
||||
f.write(content)
|
||||
|
||||
logger.info(f"RAG 文档处理: {file.filename}, 大小: {len(content)} bytes")
|
||||
|
||||
# 调用统一文档处理器
|
||||
result = await convert_to_markdown(str(temp_path), file_type)
|
||||
|
||||
# 补充文件名到 metadata
|
||||
if result.get("metadata"):
|
||||
result["metadata"]["filename"] = file.filename
|
||||
else:
|
||||
result["metadata"] = {"filename": file.filename}
|
||||
|
||||
return JSONResponse(content=result)
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning(f"文档格式不支持: {file.filename}, 错误: {e}")
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error(f"文档转 Markdown 失败: {file.filename}, 错误: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if temp_path and temp_path.exists():
|
||||
cleanup_temp_file(str(temp_path))
|
||||
|
||||
|
||||
# ==================== DC工具C - 代码执行接口 ====================
|
||||
|
||||
@app.post("/api/dc/validate")
|
||||
|
||||
@@ -67,6 +67,9 @@ __version__ = '1.0.0'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -200,6 +200,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -160,6 +160,9 @@ def apply_filter(
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -324,6 +324,9 @@ def get_unpivot_preview(
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -3,25 +3,31 @@ fastapi==0.104.1
|
||||
uvicorn[standard]==0.24.0
|
||||
python-multipart==0.0.6
|
||||
|
||||
# PDF处理
|
||||
PyMuPDF>=1.24.0 # 使用更新版本,有预编译wheel
|
||||
pdfplumber==0.10.3
|
||||
nougat-ocr==0.1.17 # 学术PDF高质量提取(英文)
|
||||
albumentations==1.3.1 # Nougat兼容版本(不要升级到2.x)
|
||||
# PDF处理 - 使用 pymupdf4llm(替代 nougat,更轻量)
|
||||
pymupdf4llm>=0.0.17 # PDF → Markdown,自动包含 pymupdf
|
||||
pdfplumber==0.10.3 # 备用 PDF 处理
|
||||
|
||||
# Docx处理(Day 3需要)
|
||||
mammoth==1.6.0
|
||||
python-docx==1.1.0
|
||||
# Word处理
|
||||
mammoth==1.6.0 # Docx → Markdown
|
||||
python-docx==1.1.0 # Docx 读取
|
||||
|
||||
# 语言检测(Day 2需要)
|
||||
# Excel/CSV处理
|
||||
pandas>=2.0.0 # 表格处理
|
||||
openpyxl>=3.1.2 # Excel 读取
|
||||
tabulate>=0.9.0 # DataFrame → Markdown
|
||||
|
||||
# PPT处理
|
||||
python-pptx>=0.6.23 # PPT 读取
|
||||
|
||||
# 语言检测
|
||||
langdetect==1.0.9
|
||||
|
||||
# 编码检测(Day 3需要)
|
||||
# 编码检测
|
||||
chardet==5.2.0
|
||||
|
||||
# 工具
|
||||
python-dotenv==1.0.0
|
||||
pydantic>=2.10.0 # 使用更新版本,有预编译wheel
|
||||
pydantic>=2.10.0
|
||||
|
||||
# 日志
|
||||
loguru==0.7.2
|
||||
|
||||
355
extraction_service/services/document_processor.py
Normal file
355
extraction_service/services/document_processor.py
Normal file
@@ -0,0 +1,355 @@
|
||||
"""
|
||||
统一文档处理入口 - DocumentProcessor
|
||||
|
||||
功能:
|
||||
- 自动检测文件类型
|
||||
- 调用对应的处理器
|
||||
- 统一输出 Markdown 格式
|
||||
|
||||
支持格式:
|
||||
- PDF (.pdf) → pymupdf4llm
|
||||
- Word (.docx) → mammoth
|
||||
- Excel (.xlsx) → pandas
|
||||
- CSV (.csv) → pandas
|
||||
- PPT (.pptx) → python-pptx
|
||||
- 纯文本 (.txt, .md) → 直接读取
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
from loguru import logger
|
||||
import chardet
|
||||
|
||||
|
||||
class DocumentProcessor:
|
||||
"""统一文档处理器"""
|
||||
|
||||
# 支持的文件类型
|
||||
SUPPORTED_TYPES = {
|
||||
'.pdf': 'pdf',
|
||||
'.docx': 'word',
|
||||
'.doc': 'word',
|
||||
'.xlsx': 'excel',
|
||||
'.xls': 'excel',
|
||||
'.csv': 'csv',
|
||||
'.pptx': 'ppt',
|
||||
'.ppt': 'ppt',
|
||||
'.txt': 'text',
|
||||
'.md': 'text',
|
||||
'.markdown': 'text',
|
||||
}
|
||||
|
||||
def to_markdown(self, file_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
将文档转换为 Markdown
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
{
|
||||
"success": True,
|
||||
"markdown": "Markdown 内容",
|
||||
"file_type": "pdf",
|
||||
"metadata": { ... }
|
||||
}
|
||||
"""
|
||||
path = Path(file_path)
|
||||
filename = path.name
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
# 检查文件类型
|
||||
if suffix not in self.SUPPORTED_TYPES:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"不支持的文件类型: {suffix}",
|
||||
"supported_types": list(self.SUPPORTED_TYPES.keys())
|
||||
}
|
||||
|
||||
file_type = self.SUPPORTED_TYPES[suffix]
|
||||
logger.info(f"处理文档: {filename}, 类型: {file_type}")
|
||||
|
||||
try:
|
||||
# 根据类型调用对应处理器
|
||||
if file_type == 'pdf':
|
||||
result = self._process_pdf(file_path)
|
||||
elif file_type == 'word':
|
||||
result = self._process_word(file_path)
|
||||
elif file_type == 'excel':
|
||||
result = self._process_excel(file_path)
|
||||
elif file_type == 'csv':
|
||||
result = self._process_csv(file_path)
|
||||
elif file_type == 'ppt':
|
||||
result = self._process_ppt(file_path)
|
||||
elif file_type == 'text':
|
||||
result = self._process_text(file_path)
|
||||
else:
|
||||
result = {
|
||||
"success": False,
|
||||
"error": f"未实现的处理器: {file_type}"
|
||||
}
|
||||
|
||||
# 添加通用信息
|
||||
if result.get("success"):
|
||||
result["file_type"] = file_type
|
||||
result["filename"] = filename
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"文档处理失败: {filename}, 错误: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"file_type": file_type,
|
||||
"filename": filename
|
||||
}
|
||||
|
||||
def _process_pdf(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理 PDF"""
|
||||
from .pdf_markdown_processor import PdfMarkdownProcessor
|
||||
|
||||
processor = PdfMarkdownProcessor()
|
||||
return processor.to_markdown(file_path)
|
||||
|
||||
def _process_word(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理 Word 文档"""
|
||||
import mammoth
|
||||
|
||||
try:
|
||||
with open(file_path, "rb") as f:
|
||||
result = mammoth.convert_to_markdown(f)
|
||||
markdown = result.value
|
||||
messages = result.messages
|
||||
|
||||
# 添加文件名上下文
|
||||
filename = Path(file_path).name
|
||||
markdown_with_context = f"## 文档: {filename}\n\n{markdown}"
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": markdown_with_context,
|
||||
"metadata": {
|
||||
"char_count": len(markdown),
|
||||
"warnings": [str(m) for m in messages] if messages else []
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:Word 文档解析失败: {str(e)}"
|
||||
}
|
||||
|
||||
def _process_excel(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理 Excel"""
|
||||
import pandas as pd
|
||||
|
||||
try:
|
||||
filename = Path(file_path).name
|
||||
xlsx = pd.ExcelFile(file_path, engine='openpyxl')
|
||||
|
||||
md_parts = []
|
||||
total_rows = 0
|
||||
|
||||
for sheet_name in xlsx.sheet_names:
|
||||
df = pd.read_excel(xlsx, sheet_name=sheet_name)
|
||||
rows = len(df)
|
||||
total_rows += rows
|
||||
|
||||
# 添加 Sheet 信息
|
||||
md_parts.append(f"## 数据: {filename} - {sheet_name}")
|
||||
md_parts.append(f"- **行列**: {rows} 行 × {len(df.columns)} 列\n")
|
||||
|
||||
# 截断大数据
|
||||
max_rows = 200
|
||||
if rows > max_rows:
|
||||
md_parts.append(f"> ⚠️ 数据量较大,仅显示前 {max_rows} 行(共 {rows} 行)\n")
|
||||
df = df.head(max_rows)
|
||||
|
||||
# 转换为 Markdown 表格
|
||||
df = df.fillna('')
|
||||
md_parts.append(df.to_markdown(index=False))
|
||||
md_parts.append("\n---\n")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": "\n".join(md_parts),
|
||||
"metadata": {
|
||||
"sheet_count": len(xlsx.sheet_names),
|
||||
"total_rows": total_rows,
|
||||
"sheets": xlsx.sheet_names
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:Excel 文档解析失败: {str(e)}"
|
||||
}
|
||||
|
||||
def _process_csv(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理 CSV"""
|
||||
import pandas as pd
|
||||
|
||||
try:
|
||||
filename = Path(file_path).name
|
||||
|
||||
# 自动检测编码
|
||||
with open(file_path, 'rb') as f:
|
||||
raw = f.read(10000)
|
||||
detected = chardet.detect(raw)
|
||||
encoding = detected.get('encoding', 'utf-8')
|
||||
|
||||
df = pd.read_csv(file_path, encoding=encoding)
|
||||
rows = len(df)
|
||||
|
||||
md_parts = []
|
||||
md_parts.append(f"## 数据: {filename}")
|
||||
md_parts.append(f"- **行列**: {rows} 行 × {len(df.columns)} 列")
|
||||
md_parts.append(f"- **编码**: {encoding}\n")
|
||||
|
||||
# 截断大数据
|
||||
max_rows = 200
|
||||
if rows > max_rows:
|
||||
md_parts.append(f"> ⚠️ 数据量较大,仅显示前 {max_rows} 行(共 {rows} 行)\n")
|
||||
df = df.head(max_rows)
|
||||
|
||||
df = df.fillna('')
|
||||
md_parts.append(df.to_markdown(index=False))
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": "\n".join(md_parts),
|
||||
"metadata": {
|
||||
"row_count": rows,
|
||||
"column_count": len(df.columns),
|
||||
"encoding": encoding
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:CSV 文件解析失败: {str(e)}"
|
||||
}
|
||||
|
||||
def _process_ppt(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理 PPT"""
|
||||
from pptx import Presentation
|
||||
|
||||
try:
|
||||
filename = Path(file_path).name
|
||||
prs = Presentation(file_path)
|
||||
|
||||
md_parts = []
|
||||
md_parts.append(f"## 演示文稿: {filename}\n")
|
||||
|
||||
for slide_num, slide in enumerate(prs.slides, 1):
|
||||
md_parts.append(f"### 幻灯片 {slide_num}")
|
||||
|
||||
# 获取标题
|
||||
if slide.shapes.title:
|
||||
md_parts.append(f"**{slide.shapes.title.text}**\n")
|
||||
|
||||
# 获取所有文本
|
||||
for shape in slide.shapes:
|
||||
if shape.has_text_frame:
|
||||
for para in shape.text_frame.paragraphs:
|
||||
text = para.text.strip()
|
||||
if text:
|
||||
md_parts.append(f"- {text}")
|
||||
|
||||
md_parts.append("")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": "\n".join(md_parts),
|
||||
"metadata": {
|
||||
"slide_count": len(prs.slides)
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:PPT 文档解析失败: {str(e)}"
|
||||
}
|
||||
|
||||
def _process_text(self, file_path: str) -> Dict[str, Any]:
|
||||
"""处理纯文本"""
|
||||
try:
|
||||
filename = Path(file_path).name
|
||||
|
||||
# 自动检测编码
|
||||
with open(file_path, 'rb') as f:
|
||||
raw = f.read()
|
||||
detected = chardet.detect(raw)
|
||||
encoding = detected.get('encoding', 'utf-8')
|
||||
|
||||
with open(file_path, 'r', encoding=encoding) as f:
|
||||
content = f.read()
|
||||
|
||||
# 如果是 .md 文件,直接返回
|
||||
if file_path.endswith('.md') or file_path.endswith('.markdown'):
|
||||
markdown = content
|
||||
else:
|
||||
# 纯文本添加文件名上下文
|
||||
markdown = f"## 文档: {filename}\n\n{content}"
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": markdown,
|
||||
"metadata": {
|
||||
"char_count": len(content),
|
||||
"encoding": encoding
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:文本文件读取失败: {str(e)}"
|
||||
}
|
||||
|
||||
|
||||
# 便捷函数
|
||||
async def convert_to_markdown(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
将文档转换为 Markdown(便捷函数,异步版本)
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
file_type: 可选,指定文件类型(如不指定则自动检测)
|
||||
|
||||
Returns:
|
||||
处理结果字典,格式:
|
||||
{
|
||||
"success": True,
|
||||
"text": "Markdown 内容",
|
||||
"format": "markdown",
|
||||
"metadata": { ... }
|
||||
}
|
||||
"""
|
||||
processor = DocumentProcessor()
|
||||
result = processor.to_markdown(file_path)
|
||||
|
||||
# 转换输出格式以匹配 API 预期
|
||||
if result.get("success"):
|
||||
return {
|
||||
"success": True,
|
||||
"text": result.get("markdown", ""),
|
||||
"format": "markdown",
|
||||
"metadata": {
|
||||
"original_file_type": result.get("file_type"),
|
||||
"char_count": len(result.get("markdown", "")),
|
||||
**result.get("metadata", {})
|
||||
}
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"error": result.get("error", "处理失败"),
|
||||
"metadata": result.get("metadata", {})
|
||||
}
|
||||
|
||||
146
extraction_service/services/pdf_markdown_processor.py
Normal file
146
extraction_service/services/pdf_markdown_processor.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
PDF Markdown 处理器 - 基于 pymupdf4llm
|
||||
|
||||
特点:
|
||||
- 输出 LLM 友好的 Markdown 格式
|
||||
- 完整保留表格结构
|
||||
- 自动检测扫描件并返回友好提示
|
||||
- 零 OCR,只处理电子版 PDF
|
||||
"""
|
||||
|
||||
import pymupdf4llm
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional, List
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PdfMarkdownProcessor:
|
||||
"""PDF → Markdown 处理器"""
|
||||
|
||||
# 扫描件检测阈值:提取文本少于此字符数视为扫描件
|
||||
MIN_TEXT_THRESHOLD = 50
|
||||
|
||||
def __init__(self, image_dir: str = "./images"):
|
||||
self.image_dir = image_dir
|
||||
|
||||
def to_markdown(
|
||||
self,
|
||||
pdf_path: str,
|
||||
page_chunks: bool = False,
|
||||
extract_images: bool = False,
|
||||
dpi: int = 150
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
PDF 转 Markdown(仅支持电子版)
|
||||
|
||||
Args:
|
||||
pdf_path: PDF 文件路径
|
||||
page_chunks: 是否按页分块
|
||||
extract_images: 是否提取图片(默认关闭,节省空间)
|
||||
dpi: 图片分辨率
|
||||
|
||||
Returns:
|
||||
{
|
||||
"success": True,
|
||||
"markdown": "Markdown 文本",
|
||||
"metadata": { "page_count": 10, "char_count": 5000 },
|
||||
"is_scanned": False
|
||||
}
|
||||
"""
|
||||
filename = Path(pdf_path).name
|
||||
|
||||
try:
|
||||
logger.info(f"开始使用 pymupdf4llm 处理: {filename}")
|
||||
|
||||
# 调用 pymupdf4llm
|
||||
md_text = pymupdf4llm.to_markdown(
|
||||
pdf_path,
|
||||
page_chunks=page_chunks,
|
||||
write_images=extract_images,
|
||||
image_path=self.image_dir if extract_images else None,
|
||||
dpi=dpi,
|
||||
show_progress=False
|
||||
)
|
||||
|
||||
# 如果返回的是列表(page_chunks=True),合并为字符串
|
||||
if isinstance(md_text, list):
|
||||
md_text = "\n\n---\n\n".join([
|
||||
f"## Page {i+1}\n\n{page.get('text', '')}"
|
||||
for i, page in enumerate(md_text)
|
||||
])
|
||||
|
||||
char_count = len(md_text.strip())
|
||||
|
||||
# 质量检查:检测是否为扫描件
|
||||
if char_count < self.MIN_TEXT_THRESHOLD:
|
||||
logger.warning(f"PDF 文本过少 ({char_count} 字符),可能为扫描件: {filename}")
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": self._scan_pdf_hint(filename, char_count),
|
||||
"metadata": {
|
||||
"page_count": self._get_page_count(pdf_path),
|
||||
"char_count": char_count,
|
||||
"is_scanned": True
|
||||
},
|
||||
"is_scanned": True
|
||||
}
|
||||
|
||||
# 获取页数
|
||||
page_count = self._get_page_count(pdf_path)
|
||||
|
||||
logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"markdown": md_text,
|
||||
"metadata": {
|
||||
"page_count": page_count,
|
||||
"char_count": char_count,
|
||||
"is_scanned": False
|
||||
},
|
||||
"is_scanned": False
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"markdown": f"> **系统提示**:文档 `{filename}` 解析失败: {str(e)}"
|
||||
}
|
||||
|
||||
def _get_page_count(self, pdf_path: str) -> int:
|
||||
"""获取 PDF 页数"""
|
||||
try:
|
||||
import fitz # pymupdf
|
||||
doc = fitz.open(pdf_path)
|
||||
count = len(doc)
|
||||
doc.close()
|
||||
return count
|
||||
except:
|
||||
return 0
|
||||
|
||||
def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
|
||||
"""生成扫描件友好提示"""
|
||||
return f"""> **系统提示**:文档 `{filename}` 似乎是扫描件(图片型 PDF)。
|
||||
>
|
||||
> - 提取文本量:{char_count} 字符
|
||||
> - 本系统暂不支持扫描版 PDF 的文字识别
|
||||
> - 建议:请上传电子版 PDF,或将扫描件转换为可编辑格式后重新上传"""
|
||||
|
||||
|
||||
# 便捷函数
|
||||
def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
|
||||
"""
|
||||
PDF 转 Markdown(便捷函数)
|
||||
|
||||
Args:
|
||||
pdf_path: PDF 文件路径
|
||||
|
||||
Returns:
|
||||
处理结果字典
|
||||
"""
|
||||
processor = PdfMarkdownProcessor()
|
||||
return processor.to_markdown(pdf_path)
|
||||
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
"""
|
||||
PDF处理主服务
|
||||
|
||||
实现顺序降级策略:
|
||||
1. 检测语言
|
||||
2. 中文PDF → PyMuPDF(快速)
|
||||
3. 英文PDF → Nougat → 失败降级PyMuPDF
|
||||
策略:
|
||||
- 所有 PDF 统一使用 PyMuPDF 处理(快速、稳定)
|
||||
- RAG 引擎推荐使用 pymupdf4llm(见 pdf_markdown_processor.py)
|
||||
|
||||
注意:Nougat 已废弃,不再使用
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
from loguru import logger
|
||||
|
||||
from .language_detector import detect_language
|
||||
from .nougat_extractor import extract_pdf_nougat, check_nougat_available
|
||||
from .pdf_extractor import extract_pdf_pymupdf
|
||||
|
||||
|
||||
@@ -20,22 +20,24 @@ def extract_pdf(
|
||||
force_method: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
PDF提取主函数(顺序降级策略)
|
||||
PDF提取主函数
|
||||
|
||||
处理流程:
|
||||
1. 检测语言
|
||||
2. 中文 → 直接PyMuPDF
|
||||
3. 英文 → 尝试Nougat → 失败降级PyMuPDF
|
||||
1. 检测语言(仅用于元数据)
|
||||
2. 使用 PyMuPDF 提取文本
|
||||
|
||||
注意:对于 RAG 引擎,推荐使用 /api/document/to-markdown 接口,
|
||||
它使用 pymupdf4llm 提供更好的表格和结构支持。
|
||||
|
||||
Args:
|
||||
file_path: PDF文件路径
|
||||
force_method: 强制使用的方法 ('nougat' | 'pymupdf')
|
||||
force_method: 保留参数(已废弃,仅支持 'pymupdf')
|
||||
|
||||
Returns:
|
||||
{
|
||||
"success": True,
|
||||
"method": "nougat" | "pymupdf",
|
||||
"reason": "chinese_pdf" | "english_pdf" | "nougat_failed" | "nougat_low_quality",
|
||||
"method": "pymupdf",
|
||||
"reason": "...",
|
||||
"text": "提取的文本",
|
||||
"metadata": {...}
|
||||
}
|
||||
@@ -43,97 +45,31 @@ def extract_pdf(
|
||||
try:
|
||||
logger.info(f"开始处理PDF: {file_path}")
|
||||
|
||||
# Step 1: 语言检测
|
||||
# Step 1: 语言检测(仅用于元数据)
|
||||
logger.info("[Step 1] 检测PDF语言...")
|
||||
language = detect_language(file_path)
|
||||
logger.info(f"检测结果: {language}")
|
||||
|
||||
# 如果强制指定方法
|
||||
if force_method:
|
||||
logger.info(f"强制使用方法: {force_method}")
|
||||
|
||||
if force_method == 'nougat':
|
||||
return extract_pdf_nougat(file_path)
|
||||
elif force_method == 'pymupdf':
|
||||
result = extract_pdf_pymupdf(file_path)
|
||||
result['reason'] = 'force_pymupdf'
|
||||
return result
|
||||
|
||||
# Step 2: 中文PDF → 直接PyMuPDF
|
||||
if language == 'chinese':
|
||||
logger.info("[Step 2] 中文PDF,使用PyMuPDF快速处理")
|
||||
|
||||
result = extract_pdf_pymupdf(file_path)
|
||||
|
||||
if result['success']:
|
||||
result['reason'] = 'chinese_pdf'
|
||||
result['detected_language'] = language
|
||||
logger.info("✅ PyMuPDF处理成功(中文PDF)")
|
||||
return result
|
||||
else:
|
||||
logger.error("❌ PyMuPDF处理失败")
|
||||
return result
|
||||
|
||||
# Step 3: 英文PDF → 尝试Nougat
|
||||
logger.info("[Step 3] 英文PDF,尝试Nougat高质量解析")
|
||||
|
||||
# 检查Nougat是否可用
|
||||
if not check_nougat_available():
|
||||
logger.warning("⚠️ Nougat不可用,降级到PyMuPDF")
|
||||
|
||||
result = extract_pdf_pymupdf(file_path)
|
||||
if result['success']:
|
||||
result['reason'] = 'nougat_unavailable'
|
||||
result['detected_language'] = language
|
||||
return result
|
||||
|
||||
# 尝试Nougat
|
||||
try:
|
||||
nougat_result = extract_pdf_nougat(file_path)
|
||||
|
||||
if not nougat_result['success']:
|
||||
logger.warning("⚠️ Nougat提取失败,降级到PyMuPDF")
|
||||
raise Exception(nougat_result.get('error', 'Nougat failed'))
|
||||
|
||||
# 质量检查
|
||||
quality_score = nougat_result['metadata'].get('quality_score', 0)
|
||||
|
||||
logger.info(f"Nougat质量评分: {quality_score:.2f}")
|
||||
|
||||
# 质量阈值:0.7
|
||||
if quality_score >= 0.7:
|
||||
logger.info("✅ Nougat处理成功(质量合格)")
|
||||
nougat_result['reason'] = 'english_pdf_high_quality'
|
||||
nougat_result['detected_language'] = language
|
||||
return nougat_result
|
||||
else:
|
||||
logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f},降级到PyMuPDF")
|
||||
raise Exception(f"Quality too low: {quality_score}")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Nougat处理失败: {str(e)},降级到PyMuPDF")
|
||||
|
||||
# Step 4: 降级到PyMuPDF
|
||||
logger.info("[Step 4] 降级使用PyMuPDF")
|
||||
# Step 2: 使用 PyMuPDF 提取
|
||||
logger.info("[Step 2] 使用PyMuPDF处理")
|
||||
|
||||
result = extract_pdf_pymupdf(file_path)
|
||||
|
||||
if result['success']:
|
||||
result['reason'] = 'nougat_failed_or_low_quality'
|
||||
result['reason'] = 'pymupdf_standard'
|
||||
result['detected_language'] = language
|
||||
result['fallback'] = True
|
||||
logger.info("✅ PyMuPDF处理成功(降级方案)")
|
||||
logger.info("✅ PyMuPDF处理成功")
|
||||
else:
|
||||
logger.error("❌ PyMuPDF处理也失败了")
|
||||
logger.error("❌ PyMuPDF处理失败")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"PDF处理完全失败: {str(e)}")
|
||||
logger.error(f"PDF处理失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"method": "unknown"
|
||||
"method": "pymupdf"
|
||||
}
|
||||
|
||||
|
||||
@@ -149,34 +85,20 @@ def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
|
||||
Returns:
|
||||
{
|
||||
"detected_language": "chinese" | "english",
|
||||
"recommended_method": "nougat" | "pymupdf",
|
||||
"recommended_method": "pymupdf",
|
||||
"reason": "...",
|
||||
"nougat_available": True | False
|
||||
"nougat_available": False # 已废弃
|
||||
}
|
||||
"""
|
||||
try:
|
||||
# 检测语言
|
||||
language = detect_language(file_path)
|
||||
|
||||
# 检查Nougat可用性
|
||||
nougat_available = check_nougat_available()
|
||||
|
||||
# 决定策略
|
||||
if language == 'chinese':
|
||||
recommended_method = 'pymupdf'
|
||||
reason = '中文PDF,推荐使用PyMuPDF快速处理'
|
||||
elif nougat_available:
|
||||
recommended_method = 'nougat'
|
||||
reason = '英文PDF,推荐使用Nougat高质量解析'
|
||||
else:
|
||||
recommended_method = 'pymupdf'
|
||||
reason = 'Nougat不可用,使用PyMuPDF'
|
||||
|
||||
return {
|
||||
"detected_language": language,
|
||||
"recommended_method": recommended_method,
|
||||
"reason": reason,
|
||||
"nougat_available": nougat_available
|
||||
"recommended_method": "pymupdf",
|
||||
"reason": "统一使用 PyMuPDF 处理(RAG 引擎推荐使用 /api/document/to-markdown)",
|
||||
"nougat_available": False # 已废弃
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -334,6 +334,9 @@ if __name__ == "__main__":
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -100,6 +100,9 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -80,6 +80,9 @@ except Exception as e:
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user