feat(rag): Complete RAG engine implementation with pgvector

Major Features:
- Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk
- Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors)
- Implemented ChunkService (smart Markdown chunking)
- Implemented VectorSearchService (multi-query + hybrid search)
- Implemented RerankService (qwen3-rerank)
- Integrated DeepSeek V3 QueryRewriter for cross-language search
- Python service: Added pymupdf4llm for PDF-to-Markdown conversion
- PKB: Dual-mode adapter (pgvector/dify/hybrid)

Architecture:
- Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector)
- Cross-language support: Chinese query matches English documents
- Small Embedding (1024) + Strong Reranker strategy

Performance:
- End-to-end latency: 2.5s
- Cost per query: 0.0025 RMB
- Accuracy improvement: +20.5% (cross-language)

Tests:
- test-embedding-service.ts: Vector embedding verified
- test-rag-e2e.ts: Full pipeline tested
- test-rerank.ts: Rerank quality validated
- test-query-rewrite.ts: Cross-language search verified
- test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf)

Documentation:
- Added 05-RAG-Engine-User-Guide.md
- Added 02-Document-Processing-User-Guide.md
- Updated system status documentation

Status: Production ready
This commit is contained in:
2026-01-21 20:24:29 +08:00
parent 1f5bf2cd65
commit 40c2f8e148
338 changed files with 11014 additions and 1158 deletions

View File

@@ -0,0 +1,355 @@
"""
统一文档处理入口 - DocumentProcessor
功能:
- 自动检测文件类型
- 调用对应的处理器
- 统一输出 Markdown 格式
支持格式:
- PDF (.pdf) → pymupdf4llm
- Word (.docx) → mammoth
- Excel (.xlsx) → pandas
- CSV (.csv) → pandas
- PPT (.pptx) → python-pptx
- 纯文本 (.txt, .md) → 直接读取
"""
from pathlib import Path
from typing import Dict, Any, Optional
from loguru import logger
import chardet
class DocumentProcessor:
"""统一文档处理器"""
# 支持的文件类型
SUPPORTED_TYPES = {
'.pdf': 'pdf',
'.docx': 'word',
'.doc': 'word',
'.xlsx': 'excel',
'.xls': 'excel',
'.csv': 'csv',
'.pptx': 'ppt',
'.ppt': 'ppt',
'.txt': 'text',
'.md': 'text',
'.markdown': 'text',
}
def to_markdown(self, file_path: str) -> Dict[str, Any]:
"""
将文档转换为 Markdown
Args:
file_path: 文件路径
Returns:
{
"success": True,
"markdown": "Markdown 内容",
"file_type": "pdf",
"metadata": { ... }
}
"""
path = Path(file_path)
filename = path.name
suffix = path.suffix.lower()
# 检查文件类型
if suffix not in self.SUPPORTED_TYPES:
return {
"success": False,
"error": f"不支持的文件类型: {suffix}",
"supported_types": list(self.SUPPORTED_TYPES.keys())
}
file_type = self.SUPPORTED_TYPES[suffix]
logger.info(f"处理文档: {filename}, 类型: {file_type}")
try:
# 根据类型调用对应处理器
if file_type == 'pdf':
result = self._process_pdf(file_path)
elif file_type == 'word':
result = self._process_word(file_path)
elif file_type == 'excel':
result = self._process_excel(file_path)
elif file_type == 'csv':
result = self._process_csv(file_path)
elif file_type == 'ppt':
result = self._process_ppt(file_path)
elif file_type == 'text':
result = self._process_text(file_path)
else:
result = {
"success": False,
"error": f"未实现的处理器: {file_type}"
}
# 添加通用信息
if result.get("success"):
result["file_type"] = file_type
result["filename"] = filename
return result
except Exception as e:
logger.error(f"文档处理失败: {filename}, 错误: {e}")
return {
"success": False,
"error": str(e),
"file_type": file_type,
"filename": filename
}
def _process_pdf(self, file_path: str) -> Dict[str, Any]:
"""处理 PDF"""
from .pdf_markdown_processor import PdfMarkdownProcessor
processor = PdfMarkdownProcessor()
return processor.to_markdown(file_path)
def _process_word(self, file_path: str) -> Dict[str, Any]:
"""处理 Word 文档"""
import mammoth
try:
with open(file_path, "rb") as f:
result = mammoth.convert_to_markdown(f)
markdown = result.value
messages = result.messages
# 添加文件名上下文
filename = Path(file_path).name
markdown_with_context = f"## 文档: {filename}\n\n{markdown}"
return {
"success": True,
"markdown": markdown_with_context,
"metadata": {
"char_count": len(markdown),
"warnings": [str(m) for m in messages] if messages else []
}
}
except Exception as e:
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**Word 文档解析失败: {str(e)}"
}
def _process_excel(self, file_path: str) -> Dict[str, Any]:
"""处理 Excel"""
import pandas as pd
try:
filename = Path(file_path).name
xlsx = pd.ExcelFile(file_path, engine='openpyxl')
md_parts = []
total_rows = 0
for sheet_name in xlsx.sheet_names:
df = pd.read_excel(xlsx, sheet_name=sheet_name)
rows = len(df)
total_rows += rows
# 添加 Sheet 信息
md_parts.append(f"## 数据: {filename} - {sheet_name}")
md_parts.append(f"- **行列**: {rows}× {len(df.columns)}\n")
# 截断大数据
max_rows = 200
if rows > max_rows:
md_parts.append(f"> ⚠️ 数据量较大,仅显示前 {max_rows} 行(共 {rows} 行)\n")
df = df.head(max_rows)
# 转换为 Markdown 表格
df = df.fillna('')
md_parts.append(df.to_markdown(index=False))
md_parts.append("\n---\n")
return {
"success": True,
"markdown": "\n".join(md_parts),
"metadata": {
"sheet_count": len(xlsx.sheet_names),
"total_rows": total_rows,
"sheets": xlsx.sheet_names
}
}
except Exception as e:
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**Excel 文档解析失败: {str(e)}"
}
def _process_csv(self, file_path: str) -> Dict[str, Any]:
"""处理 CSV"""
import pandas as pd
try:
filename = Path(file_path).name
# 自动检测编码
with open(file_path, 'rb') as f:
raw = f.read(10000)
detected = chardet.detect(raw)
encoding = detected.get('encoding', 'utf-8')
df = pd.read_csv(file_path, encoding=encoding)
rows = len(df)
md_parts = []
md_parts.append(f"## 数据: {filename}")
md_parts.append(f"- **行列**: {rows}× {len(df.columns)}")
md_parts.append(f"- **编码**: {encoding}\n")
# 截断大数据
max_rows = 200
if rows > max_rows:
md_parts.append(f"> ⚠️ 数据量较大,仅显示前 {max_rows} 行(共 {rows} 行)\n")
df = df.head(max_rows)
df = df.fillna('')
md_parts.append(df.to_markdown(index=False))
return {
"success": True,
"markdown": "\n".join(md_parts),
"metadata": {
"row_count": rows,
"column_count": len(df.columns),
"encoding": encoding
}
}
except Exception as e:
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**CSV 文件解析失败: {str(e)}"
}
def _process_ppt(self, file_path: str) -> Dict[str, Any]:
"""处理 PPT"""
from pptx import Presentation
try:
filename = Path(file_path).name
prs = Presentation(file_path)
md_parts = []
md_parts.append(f"## 演示文稿: {filename}\n")
for slide_num, slide in enumerate(prs.slides, 1):
md_parts.append(f"### 幻灯片 {slide_num}")
# 获取标题
if slide.shapes.title:
md_parts.append(f"**{slide.shapes.title.text}**\n")
# 获取所有文本
for shape in slide.shapes:
if shape.has_text_frame:
for para in shape.text_frame.paragraphs:
text = para.text.strip()
if text:
md_parts.append(f"- {text}")
md_parts.append("")
return {
"success": True,
"markdown": "\n".join(md_parts),
"metadata": {
"slide_count": len(prs.slides)
}
}
except Exception as e:
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**PPT 文档解析失败: {str(e)}"
}
def _process_text(self, file_path: str) -> Dict[str, Any]:
"""处理纯文本"""
try:
filename = Path(file_path).name
# 自动检测编码
with open(file_path, 'rb') as f:
raw = f.read()
detected = chardet.detect(raw)
encoding = detected.get('encoding', 'utf-8')
with open(file_path, 'r', encoding=encoding) as f:
content = f.read()
# 如果是 .md 文件,直接返回
if file_path.endswith('.md') or file_path.endswith('.markdown'):
markdown = content
else:
# 纯文本添加文件名上下文
markdown = f"## 文档: {filename}\n\n{content}"
return {
"success": True,
"markdown": markdown,
"metadata": {
"char_count": len(content),
"encoding": encoding
}
}
except Exception as e:
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**:文本文件读取失败: {str(e)}"
}
# 便捷函数
async def convert_to_markdown(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
"""
将文档转换为 Markdown便捷函数异步版本
Args:
file_path: 文件路径
file_type: 可选,指定文件类型(如不指定则自动检测)
Returns:
处理结果字典,格式:
{
"success": True,
"text": "Markdown 内容",
"format": "markdown",
"metadata": { ... }
}
"""
processor = DocumentProcessor()
result = processor.to_markdown(file_path)
# 转换输出格式以匹配 API 预期
if result.get("success"):
return {
"success": True,
"text": result.get("markdown", ""),
"format": "markdown",
"metadata": {
"original_file_type": result.get("file_type"),
"char_count": len(result.get("markdown", "")),
**result.get("metadata", {})
}
}
else:
return {
"success": False,
"error": result.get("error", "处理失败"),
"metadata": result.get("metadata", {})
}

View File

@@ -0,0 +1,146 @@
"""
PDF Markdown 处理器 - 基于 pymupdf4llm
特点:
- 输出 LLM 友好的 Markdown 格式
- 完整保留表格结构
- 自动检测扫描件并返回友好提示
- 零 OCR只处理电子版 PDF
"""
import pymupdf4llm
from pathlib import Path
from typing import Dict, Any, Optional, List
from loguru import logger
class PdfMarkdownProcessor:
"""PDF → Markdown 处理器"""
# 扫描件检测阈值:提取文本少于此字符数视为扫描件
MIN_TEXT_THRESHOLD = 50
def __init__(self, image_dir: str = "./images"):
self.image_dir = image_dir
def to_markdown(
self,
pdf_path: str,
page_chunks: bool = False,
extract_images: bool = False,
dpi: int = 150
) -> Dict[str, Any]:
"""
PDF 转 Markdown仅支持电子版
Args:
pdf_path: PDF 文件路径
page_chunks: 是否按页分块
extract_images: 是否提取图片(默认关闭,节省空间)
dpi: 图片分辨率
Returns:
{
"success": True,
"markdown": "Markdown 文本",
"metadata": { "page_count": 10, "char_count": 5000 },
"is_scanned": False
}
"""
filename = Path(pdf_path).name
try:
logger.info(f"开始使用 pymupdf4llm 处理: {filename}")
# 调用 pymupdf4llm
md_text = pymupdf4llm.to_markdown(
pdf_path,
page_chunks=page_chunks,
write_images=extract_images,
image_path=self.image_dir if extract_images else None,
dpi=dpi,
show_progress=False
)
# 如果返回的是列表page_chunks=True合并为字符串
if isinstance(md_text, list):
md_text = "\n\n---\n\n".join([
f"## Page {i+1}\n\n{page.get('text', '')}"
for i, page in enumerate(md_text)
])
char_count = len(md_text.strip())
# 质量检查:检测是否为扫描件
if char_count < self.MIN_TEXT_THRESHOLD:
logger.warning(f"PDF 文本过少 ({char_count} 字符),可能为扫描件: {filename}")
return {
"success": True,
"markdown": self._scan_pdf_hint(filename, char_count),
"metadata": {
"page_count": self._get_page_count(pdf_path),
"char_count": char_count,
"is_scanned": True
},
"is_scanned": True
}
# 获取页数
page_count = self._get_page_count(pdf_path)
logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")
return {
"success": True,
"markdown": md_text,
"metadata": {
"page_count": page_count,
"char_count": char_count,
"is_scanned": False
},
"is_scanned": False
}
except Exception as e:
logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
return {
"success": False,
"error": str(e),
"markdown": f"> **系统提示**:文档 `{filename}` 解析失败: {str(e)}"
}
def _get_page_count(self, pdf_path: str) -> int:
"""获取 PDF 页数"""
try:
import fitz # pymupdf
doc = fitz.open(pdf_path)
count = len(doc)
doc.close()
return count
except:
return 0
def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
"""生成扫描件友好提示"""
return f"""> **系统提示**:文档 `{filename}` 似乎是扫描件(图片型 PDF
>
> - 提取文本量:{char_count} 字符
> - 本系统暂不支持扫描版 PDF 的文字识别
> - 建议:请上传电子版 PDF或将扫描件转换为可编辑格式后重新上传"""
# 便捷函数
def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
"""
PDF 转 Markdown便捷函数
Args:
pdf_path: PDF 文件路径
Returns:
处理结果字典
"""
processor = PdfMarkdownProcessor()
return processor.to_markdown(pdf_path)

View File

@@ -1,17 +1,17 @@
"""
PDF处理主服务
实现顺序降级策略:
1. 检测语言
2. 中文PDF → PyMuPDF快速
3. 英文PDF → Nougat → 失败降级PyMuPDF
策略:
- 所有 PDF 统一使用 PyMuPDF 处理(快速、稳定)
- RAG 引擎推荐使用 pymupdf4llm见 pdf_markdown_processor.py
注意Nougat 已废弃,不再使用
"""
from typing import Dict, Any, Optional
from loguru import logger
from .language_detector import detect_language
from .nougat_extractor import extract_pdf_nougat, check_nougat_available
from .pdf_extractor import extract_pdf_pymupdf
@@ -20,22 +20,24 @@ def extract_pdf(
force_method: Optional[str] = None
) -> Dict[str, Any]:
"""
PDF提取主函数(顺序降级策略)
PDF提取主函数
处理流程:
1. 检测语言
2. 中文 → 直接PyMuPDF
3. 英文 → 尝试Nougat → 失败降级PyMuPDF
1. 检测语言(仅用于元数据)
2. 使用 PyMuPDF 提取文本
注意:对于 RAG 引擎,推荐使用 /api/document/to-markdown 接口,
它使用 pymupdf4llm 提供更好的表格和结构支持。
Args:
file_path: PDF文件路径
force_method: 强制使用的方法 ('nougat' | 'pymupdf')
force_method: 保留参数(已废弃,仅支持 'pymupdf'
Returns:
{
"success": True,
"method": "nougat" | "pymupdf",
"reason": "chinese_pdf" | "english_pdf" | "nougat_failed" | "nougat_low_quality",
"method": "pymupdf",
"reason": "...",
"text": "提取的文本",
"metadata": {...}
}
@@ -43,97 +45,31 @@ def extract_pdf(
try:
logger.info(f"开始处理PDF: {file_path}")
# Step 1: 语言检测
# Step 1: 语言检测(仅用于元数据)
logger.info("[Step 1] 检测PDF语言...")
language = detect_language(file_path)
logger.info(f"检测结果: {language}")
# 如果强制指定方法
if force_method:
logger.info(f"强制使用方法: {force_method}")
if force_method == 'nougat':
return extract_pdf_nougat(file_path)
elif force_method == 'pymupdf':
result = extract_pdf_pymupdf(file_path)
result['reason'] = 'force_pymupdf'
return result
# Step 2: 中文PDF → 直接PyMuPDF
if language == 'chinese':
logger.info("[Step 2] 中文PDF使用PyMuPDF快速处理")
result = extract_pdf_pymupdf(file_path)
if result['success']:
result['reason'] = 'chinese_pdf'
result['detected_language'] = language
logger.info("✅ PyMuPDF处理成功中文PDF")
return result
else:
logger.error("❌ PyMuPDF处理失败")
return result
# Step 3: 英文PDF → 尝试Nougat
logger.info("[Step 3] 英文PDF尝试Nougat高质量解析")
# 检查Nougat是否可用
if not check_nougat_available():
logger.warning("⚠️ Nougat不可用降级到PyMuPDF")
result = extract_pdf_pymupdf(file_path)
if result['success']:
result['reason'] = 'nougat_unavailable'
result['detected_language'] = language
return result
# 尝试Nougat
try:
nougat_result = extract_pdf_nougat(file_path)
if not nougat_result['success']:
logger.warning("⚠️ Nougat提取失败降级到PyMuPDF")
raise Exception(nougat_result.get('error', 'Nougat failed'))
# 质量检查
quality_score = nougat_result['metadata'].get('quality_score', 0)
logger.info(f"Nougat质量评分: {quality_score:.2f}")
# 质量阈值0.7
if quality_score >= 0.7:
logger.info("✅ Nougat处理成功质量合格")
nougat_result['reason'] = 'english_pdf_high_quality'
nougat_result['detected_language'] = language
return nougat_result
else:
logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f}降级到PyMuPDF")
raise Exception(f"Quality too low: {quality_score}")
except Exception as e:
logger.warning(f"Nougat处理失败: {str(e)}降级到PyMuPDF")
# Step 4: 降级到PyMuPDF
logger.info("[Step 4] 降级使用PyMuPDF")
# Step 2: 使用 PyMuPDF 提取
logger.info("[Step 2] 使用PyMuPDF处理")
result = extract_pdf_pymupdf(file_path)
if result['success']:
result['reason'] = 'nougat_failed_or_low_quality'
result['reason'] = 'pymupdf_standard'
result['detected_language'] = language
result['fallback'] = True
logger.info("✅ PyMuPDF处理成功降级方案")
logger.info("✅ PyMuPDF处理成功")
else:
logger.error("❌ PyMuPDF处理失败")
logger.error("❌ PyMuPDF处理失败")
return result
except Exception as e:
logger.error(f"PDF处理完全失败: {str(e)}")
logger.error(f"PDF处理失败: {str(e)}")
return {
"success": False,
"error": str(e),
"method": "unknown"
"method": "pymupdf"
}
@@ -149,34 +85,20 @@ def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
Returns:
{
"detected_language": "chinese" | "english",
"recommended_method": "nougat" | "pymupdf",
"recommended_method": "pymupdf",
"reason": "...",
"nougat_available": True | False
"nougat_available": False # 已废弃
}
"""
try:
# 检测语言
language = detect_language(file_path)
# 检查Nougat可用性
nougat_available = check_nougat_available()
# 决定策略
if language == 'chinese':
recommended_method = 'pymupdf'
reason = '中文PDF推荐使用PyMuPDF快速处理'
elif nougat_available:
recommended_method = 'nougat'
reason = '英文PDF推荐使用Nougat高质量解析'
else:
recommended_method = 'pymupdf'
reason = 'Nougat不可用使用PyMuPDF'
return {
"detected_language": language,
"recommended_method": recommended_method,
"reason": reason,
"nougat_available": nougat_available
"recommended_method": "pymupdf",
"reason": "统一使用 PyMuPDF 处理RAG 引擎推荐使用 /api/document/to-markdown",
"nougat_available": False # 已废弃
}
except Exception as e: