feat(rag): Complete RAG engine implementation with pgvector

Major Features: - Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk - Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors) - Implemented ChunkService (smart Markdown chunking) - Implemented VectorSearchService (multi-query + hybrid search) - Implemented RerankService (qwen3-rerank) - Integrated DeepSeek V3 QueryRewriter for cross-language search - Python service: Added pymupdf4llm for PDF-to-Markdown conversion - PKB: Dual-mode adapter (pgvector/dify/hybrid) Architecture: - Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector) - Cross-language support: Chinese query matches English documents - Small Embedding (1024) + Strong Reranker strategy Performance: - End-to-end latency: 2.5s - Cost per query: 0.0025 RMB - Accuracy improvement: +20.5% (cross-language) Tests: - test-embedding-service.ts: Vector embedding verified - test-rag-e2e.ts: Full pipeline tested - test-rerank.ts: Rerank quality validated - test-query-rewrite.ts: Cross-language search verified - test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf) Documentation: - Added 05-RAG-Engine-User-Guide.md - Added 02-Document-Processing-User-Guide.md - Updated system status documentation Status: Production ready
2026-01-21 20:24:29 +08:00
parent 1f5bf2cd65
commit 40c2f8e148
338 changed files with 11014 additions and 1158 deletions
--- a/extraction_service/.dockerignore
+++ b/extraction_service/.dockerignore
@@ -79,6 +79,9 @@ models/



+
+
+



--- a/extraction_service/main.py
+++ b/extraction_service/main.py
@@ -56,11 +56,17 @@ TEMP_DIR.mkdir(parents=True, exist_ok=True)
 from services.pdf_extractor import extract_pdf_pymupdf
 from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
 from services.language_detector import detect_language, detect_language_detailed
-from services.nougat_extractor import check_nougat_available, get_nougat_info
 from services.file_utils import detect_file_type, cleanup_temp_file
 from services.docx_extractor import extract_docx_mammoth, validate_docx_file
 from services.txt_extractor import extract_txt, validate_txt_file
 from services.dc_executor import validate_code, execute_pandas_code
+# 新增：统一文档处理器（RAG 引擎使用）
+from services.document_processor import DocumentProcessor, convert_to_markdown
+from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to_markdown
+
+# 兼容：nougat 相关（已废弃，保留空实现避免报错）
+def check_nougat_available(): return False
+def get_nougat_info(): return {"available": False, "reason": "已废弃，使用 pymupdf4llm 替代"}

 # ✨ 导入预写的数据操作函数
 from operations.filter import apply_filter
@@ -661,6 +667,72 @@ async def extract_document(
        )


+# ==================== RAG 引擎 - 文档转 Markdown 接口 ====================
+
+@app.post("/api/document/to-markdown")
+async def document_to_markdown(
+    file: UploadFile = File(...),
+    file_type: Optional[str] = None
+):
+    """
+    RAG 引擎 - 文档转 Markdown 接口
+    
+    将各种格式的文档（PDF、Word、TXT 等）转换为 LLM 友好的 Markdown 格式。
+    这是知识库引擎的核心文档处理接口。
+    
+    Args:
+        file: 上传的文件
+        file_type: 可选，指定文件类型 ('pdf' | 'docx' | 'txt' | 'md')
+    
+    Returns:
+        {
+            "success": true,
+            "text": "# 文档标题\\n\\n文档内容...",
+            "format": "markdown",
+            "metadata": {
+                "original_file_type": "pdf",
+                "char_count": 12345,
+                "filename": "example.pdf"
+            }
+        }
+    
+    Raises:
+        400: 不支持的文件格式
+        500: 处理失败
+    """
+    temp_path = None
+    try:
+        # 保存上传的文件到临时目录
+        temp_path = TEMP_DIR / file.filename
+        with open(temp_path, "wb") as f:
+            content = await file.read()
+            f.write(content)
+        
+        logger.info(f"RAG 文档处理: {file.filename}, 大小: {len(content)} bytes")
+        
+        # 调用统一文档处理器
+        result = await convert_to_markdown(str(temp_path), file_type)
+        
+        # 补充文件名到 metadata
+        if result.get("metadata"):
+            result["metadata"]["filename"] = file.filename
+        else:
+            result["metadata"] = {"filename": file.filename}
+        
+        return JSONResponse(content=result)
+    
+    except ValueError as e:
+        logger.warning(f"文档格式不支持: {file.filename}, 错误: {e}")
+        raise HTTPException(status_code=400, detail=str(e))
+    except Exception as e:
+        logger.error(f"文档转 Markdown 失败: {file.filename}, 错误: {e}")
+        raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")
+    finally:
+        # 清理临时文件
+        if temp_path and temp_path.exists():
+            cleanup_temp_file(str(temp_path))
+
+
 # ==================== DC工具C - 代码执行接口 ====================

@app.post("/api/dc/validate")
--- a/extraction_service/operations/init.py
+++ b/extraction_service/operations/init.py
@@ -67,6 +67,9 @@ __version__ = '1.0.0'



+
+
+



--- a/extraction_service/operations/dropna.py
+++ b/extraction_service/operations/dropna.py
@@ -200,6 +200,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict:



+
+
+



--- a/extraction_service/operations/filter.py
+++ b/extraction_service/operations/filter.py
@@ -160,6 +160,9 @@ def apply_filter(



+
+
+



--- a/extraction_service/operations/unpivot.py
+++ b/extraction_service/operations/unpivot.py
@@ -324,6 +324,9 @@ def get_unpivot_preview(



+
+
+



--- a/extraction_service/requirements.txt
+++ b/extraction_service/requirements.txt
@@ -3,25 +3,31 @@ fastapi==0.104.1
 uvicorn[standard]==0.24.0
 python-multipart==0.0.6

-# PDF处理
-PyMuPDF>=1.24.0  # 使用更新版本，有预编译wheel
-pdfplumber==0.10.3
-nougat-ocr==0.1.17  # 学术PDF高质量提取（英文）
-albumentations==1.3.1  # Nougat兼容版本（不要升级到2.x）
+# PDF处理 - 使用 pymupdf4llm（替代 nougat，更轻量）
+pymupdf4llm>=0.0.17         # PDF → Markdown，自动包含 pymupdf
+pdfplumber==0.10.3          # 备用 PDF 处理

-# Docx处理（Day 3需要）
-mammoth==1.6.0
-python-docx==1.1.0
+# Word处理
+mammoth==1.6.0              # Docx → Markdown
+python-docx==1.1.0          # Docx 读取

-# 语言检测（Day 2需要）
+# Excel/CSV处理
+pandas>=2.0.0               # 表格处理
+openpyxl>=3.1.2             # Excel 读取
+tabulate>=0.9.0             # DataFrame → Markdown
+
+# PPT处理
+python-pptx>=0.6.23         # PPT 读取
+
+# 语言检测
 langdetect==1.0.9

-# 编码检测（Day 3需要）
+# 编码检测
 chardet==5.2.0

 # 工具
 python-dotenv==1.0.0
-pydantic>=2.10.0  # 使用更新版本，有预编译wheel
+pydantic>=2.10.0

 # 日志
 loguru==0.7.2
--- a/extraction_service/services/document_processor.py
+++ b/extraction_service/services/document_processor.py
@@ -0,0 +1,355 @@
+"""
+统一文档处理入口 - DocumentProcessor
+
+功能：
+- 自动检测文件类型
+- 调用对应的处理器
+- 统一输出 Markdown 格式
+
+支持格式：
+- PDF (.pdf) → pymupdf4llm
+- Word (.docx) → mammoth
+- Excel (.xlsx) → pandas
+- CSV (.csv) → pandas
+- PPT (.pptx) → python-pptx
+- 纯文本 (.txt, .md) → 直接读取
+"""
+
+from pathlib import Path
+from typing import Dict, Any, Optional
+from loguru import logger
+import chardet
+
+
+class DocumentProcessor:
+    """统一文档处理器"""
+    
+    # 支持的文件类型
+    SUPPORTED_TYPES = {
+        '.pdf': 'pdf',
+        '.docx': 'word',
+        '.doc': 'word',
+        '.xlsx': 'excel',
+        '.xls': 'excel',
+        '.csv': 'csv',
+        '.pptx': 'ppt',
+        '.ppt': 'ppt',
+        '.txt': 'text',
+        '.md': 'text',
+        '.markdown': 'text',
+    }
+    
+    def to_markdown(self, file_path: str) -> Dict[str, Any]:
+        """
+        将文档转换为 Markdown
+        
+        Args:
+            file_path: 文件路径
+        
+        Returns:
+            {
+                "success": True,
+                "markdown": "Markdown 内容",
+                "file_type": "pdf",
+                "metadata": { ... }
+            }
+        """
+        path = Path(file_path)
+        filename = path.name
+        suffix = path.suffix.lower()
+        
+        # 检查文件类型
+        if suffix not in self.SUPPORTED_TYPES:
+            return {
+                "success": False,
+                "error": f"不支持的文件类型: {suffix}",
+                "supported_types": list(self.SUPPORTED_TYPES.keys())
+            }
+        
+        file_type = self.SUPPORTED_TYPES[suffix]
+        logger.info(f"处理文档: {filename}, 类型: {file_type}")
+        
+        try:
+            # 根据类型调用对应处理器
+            if file_type == 'pdf':
+                result = self._process_pdf(file_path)
+            elif file_type == 'word':
+                result = self._process_word(file_path)
+            elif file_type == 'excel':
+                result = self._process_excel(file_path)
+            elif file_type == 'csv':
+                result = self._process_csv(file_path)
+            elif file_type == 'ppt':
+                result = self._process_ppt(file_path)
+            elif file_type == 'text':
+                result = self._process_text(file_path)
+            else:
+                result = {
+                    "success": False,
+                    "error": f"未实现的处理器: {file_type}"
+                }
+            
+            # 添加通用信息
+            if result.get("success"):
+                result["file_type"] = file_type
+                result["filename"] = filename
+            
+            return result
+            
+        except Exception as e:
+            logger.error(f"文档处理失败: {filename}, 错误: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "file_type": file_type,
+                "filename": filename
+            }
+    
+    def _process_pdf(self, file_path: str) -> Dict[str, Any]:
+        """处理 PDF"""
+        from .pdf_markdown_processor import PdfMarkdownProcessor
+        
+        processor = PdfMarkdownProcessor()
+        return processor.to_markdown(file_path)
+    
+    def _process_word(self, file_path: str) -> Dict[str, Any]:
+        """处理 Word 文档"""
+        import mammoth
+        
+        try:
+            with open(file_path, "rb") as f:
+                result = mammoth.convert_to_markdown(f)
+                markdown = result.value
+                messages = result.messages
+            
+            # 添加文件名上下文
+            filename = Path(file_path).name
+            markdown_with_context = f"## 文档: {filename}\n\n{markdown}"
+            
+            return {
+                "success": True,
+                "markdown": markdown_with_context,
+                "metadata": {
+                    "char_count": len(markdown),
+                    "warnings": [str(m) for m in messages] if messages else []
+                }
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：Word 文档解析失败: {str(e)}"
+            }
+    
+    def _process_excel(self, file_path: str) -> Dict[str, Any]:
+        """处理 Excel"""
+        import pandas as pd
+        
+        try:
+            filename = Path(file_path).name
+            xlsx = pd.ExcelFile(file_path, engine='openpyxl')
+            
+            md_parts = []
+            total_rows = 0
+            
+            for sheet_name in xlsx.sheet_names:
+                df = pd.read_excel(xlsx, sheet_name=sheet_name)
+                rows = len(df)
+                total_rows += rows
+                
+                # 添加 Sheet 信息
+                md_parts.append(f"## 数据: {filename} - {sheet_name}")
+                md_parts.append(f"- **行列**: {rows} 行 × {len(df.columns)} 列\n")
+                
+                # 截断大数据
+                max_rows = 200
+                if rows > max_rows:
+                    md_parts.append(f"> ⚠️ 数据量较大，仅显示前 {max_rows} 行（共 {rows} 行）\n")
+                    df = df.head(max_rows)
+                
+                # 转换为 Markdown 表格
+                df = df.fillna('')
+                md_parts.append(df.to_markdown(index=False))
+                md_parts.append("\n---\n")
+            
+            return {
+                "success": True,
+                "markdown": "\n".join(md_parts),
+                "metadata": {
+                    "sheet_count": len(xlsx.sheet_names),
+                    "total_rows": total_rows,
+                    "sheets": xlsx.sheet_names
+                }
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：Excel 文档解析失败: {str(e)}"
+            }
+    
+    def _process_csv(self, file_path: str) -> Dict[str, Any]:
+        """处理 CSV"""
+        import pandas as pd
+        
+        try:
+            filename = Path(file_path).name
+            
+            # 自动检测编码
+            with open(file_path, 'rb') as f:
+                raw = f.read(10000)
+                detected = chardet.detect(raw)
+                encoding = detected.get('encoding', 'utf-8')
+            
+            df = pd.read_csv(file_path, encoding=encoding)
+            rows = len(df)
+            
+            md_parts = []
+            md_parts.append(f"## 数据: {filename}")
+            md_parts.append(f"- **行列**: {rows} 行 × {len(df.columns)} 列")
+            md_parts.append(f"- **编码**: {encoding}\n")
+            
+            # 截断大数据
+            max_rows = 200
+            if rows > max_rows:
+                md_parts.append(f"> ⚠️ 数据量较大，仅显示前 {max_rows} 行（共 {rows} 行）\n")
+                df = df.head(max_rows)
+            
+            df = df.fillna('')
+            md_parts.append(df.to_markdown(index=False))
+            
+            return {
+                "success": True,
+                "markdown": "\n".join(md_parts),
+                "metadata": {
+                    "row_count": rows,
+                    "column_count": len(df.columns),
+                    "encoding": encoding
+                }
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：CSV 文件解析失败: {str(e)}"
+            }
+    
+    def _process_ppt(self, file_path: str) -> Dict[str, Any]:
+        """处理 PPT"""
+        from pptx import Presentation
+        
+        try:
+            filename = Path(file_path).name
+            prs = Presentation(file_path)
+            
+            md_parts = []
+            md_parts.append(f"## 演示文稿: {filename}\n")
+            
+            for slide_num, slide in enumerate(prs.slides, 1):
+                md_parts.append(f"### 幻灯片 {slide_num}")
+                
+                # 获取标题
+                if slide.shapes.title:
+                    md_parts.append(f"**{slide.shapes.title.text}**\n")
+                
+                # 获取所有文本
+                for shape in slide.shapes:
+                    if shape.has_text_frame:
+                        for para in shape.text_frame.paragraphs:
+                            text = para.text.strip()
+                            if text:
+                                md_parts.append(f"- {text}")
+                
+                md_parts.append("")
+            
+            return {
+                "success": True,
+                "markdown": "\n".join(md_parts),
+                "metadata": {
+                    "slide_count": len(prs.slides)
+                }
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：PPT 文档解析失败: {str(e)}"
+            }
+    
+    def _process_text(self, file_path: str) -> Dict[str, Any]:
+        """处理纯文本"""
+        try:
+            filename = Path(file_path).name
+            
+            # 自动检测编码
+            with open(file_path, 'rb') as f:
+                raw = f.read()
+                detected = chardet.detect(raw)
+                encoding = detected.get('encoding', 'utf-8')
+            
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+            
+            # 如果是 .md 文件，直接返回
+            if file_path.endswith('.md') or file_path.endswith('.markdown'):
+                markdown = content
+            else:
+                # 纯文本添加文件名上下文
+                markdown = f"## 文档: {filename}\n\n{content}"
+            
+            return {
+                "success": True,
+                "markdown": markdown,
+                "metadata": {
+                    "char_count": len(content),
+                    "encoding": encoding
+                }
+            }
+        except Exception as e:
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：文本文件读取失败: {str(e)}"
+            }
+
+
+# 便捷函数
+async def convert_to_markdown(file_path: str, file_type: Optional[str] = None) -> Dict[str, Any]:
+    """
+    将文档转换为 Markdown（便捷函数，异步版本）
+    
+    Args:
+        file_path: 文件路径
+        file_type: 可选，指定文件类型（如不指定则自动检测）
+    
+    Returns:
+        处理结果字典，格式：
+        {
+            "success": True,
+            "text": "Markdown 内容",
+            "format": "markdown",
+            "metadata": { ... }
+        }
+    """
+    processor = DocumentProcessor()
+    result = processor.to_markdown(file_path)
+    
+    # 转换输出格式以匹配 API 预期
+    if result.get("success"):
+        return {
+            "success": True,
+            "text": result.get("markdown", ""),
+            "format": "markdown",
+            "metadata": {
+                "original_file_type": result.get("file_type"),
+                "char_count": len(result.get("markdown", "")),
+                **result.get("metadata", {})
+            }
+        }
+    else:
+        return {
+            "success": False,
+            "error": result.get("error", "处理失败"),
+            "metadata": result.get("metadata", {})
+        }
+
--- a/extraction_service/services/pdf_markdown_processor.py
+++ b/extraction_service/services/pdf_markdown_processor.py
@@ -0,0 +1,146 @@
+"""
+PDF Markdown 处理器 - 基于 pymupdf4llm
+
+特点：
+- 输出 LLM 友好的 Markdown 格式
+- 完整保留表格结构
+- 自动检测扫描件并返回友好提示
+- 零 OCR，只处理电子版 PDF
+"""
+
+import pymupdf4llm
+from pathlib import Path
+from typing import Dict, Any, Optional, List
+from loguru import logger
+
+
+class PdfMarkdownProcessor:
+    """PDF → Markdown 处理器"""
+    
+    # 扫描件检测阈值：提取文本少于此字符数视为扫描件
+    MIN_TEXT_THRESHOLD = 50
+    
+    def __init__(self, image_dir: str = "./images"):
+        self.image_dir = image_dir
+    
+    def to_markdown(
+        self,
+        pdf_path: str,
+        page_chunks: bool = False,
+        extract_images: bool = False,
+        dpi: int = 150
+    ) -> Dict[str, Any]:
+        """
+        PDF 转 Markdown（仅支持电子版）
+        
+        Args:
+            pdf_path: PDF 文件路径
+            page_chunks: 是否按页分块
+            extract_images: 是否提取图片（默认关闭，节省空间）
+            dpi: 图片分辨率
+        
+        Returns:
+            {
+                "success": True,
+                "markdown": "Markdown 文本",
+                "metadata": { "page_count": 10, "char_count": 5000 },
+                "is_scanned": False
+            }
+        """
+        filename = Path(pdf_path).name
+        
+        try:
+            logger.info(f"开始使用 pymupdf4llm 处理: {filename}")
+            
+            # 调用 pymupdf4llm
+            md_text = pymupdf4llm.to_markdown(
+                pdf_path,
+                page_chunks=page_chunks,
+                write_images=extract_images,
+                image_path=self.image_dir if extract_images else None,
+                dpi=dpi,
+                show_progress=False
+            )
+            
+            # 如果返回的是列表（page_chunks=True），合并为字符串
+            if isinstance(md_text, list):
+                md_text = "\n\n---\n\n".join([
+                    f"## Page {i+1}\n\n{page.get('text', '')}" 
+                    for i, page in enumerate(md_text)
+                ])
+            
+            char_count = len(md_text.strip())
+            
+            # 质量检查：检测是否为扫描件
+            if char_count < self.MIN_TEXT_THRESHOLD:
+                logger.warning(f"PDF 文本过少 ({char_count} 字符)，可能为扫描件: {filename}")
+                return {
+                    "success": True,
+                    "markdown": self._scan_pdf_hint(filename, char_count),
+                    "metadata": {
+                        "page_count": self._get_page_count(pdf_path),
+                        "char_count": char_count,
+                        "is_scanned": True
+                    },
+                    "is_scanned": True
+                }
+            
+            # 获取页数
+            page_count = self._get_page_count(pdf_path)
+            
+            logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")
+            
+            return {
+                "success": True,
+                "markdown": md_text,
+                "metadata": {
+                    "page_count": page_count,
+                    "char_count": char_count,
+                    "is_scanned": False
+                },
+                "is_scanned": False
+            }
+            
+        except Exception as e:
+            logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
+            return {
+                "success": False,
+                "error": str(e),
+                "markdown": f"> **系统提示**：文档 `{filename}` 解析失败: {str(e)}"
+            }
+    
+    def _get_page_count(self, pdf_path: str) -> int:
+        """获取 PDF 页数"""
+        try:
+            import fitz  # pymupdf
+            doc = fitz.open(pdf_path)
+            count = len(doc)
+            doc.close()
+            return count
+        except:
+            return 0
+    
+    def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
+        """生成扫描件友好提示"""
+        return f"""> **系统提示**：文档 `{filename}` 似乎是扫描件（图片型 PDF）。
+> 
+> - 提取文本量：{char_count} 字符
+> - 本系统暂不支持扫描版 PDF 的文字识别
+> - 建议：请上传电子版 PDF，或将扫描件转换为可编辑格式后重新上传"""
+
+
+# 便捷函数
+def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
+    """
+    PDF 转 Markdown（便捷函数）
+    
+    Args:
+        pdf_path: PDF 文件路径
+    
+    Returns:
+        处理结果字典
+    """
+    processor = PdfMarkdownProcessor()
+    return processor.to_markdown(pdf_path)
+
+
--- a/extraction_service/services/pdf_processor.py
+++ b/extraction_service/services/pdf_processor.py
@@ -1,17 +1,17 @@
 """
 PDF处理主服务

-实现顺序降级策略：
-1. 检测语言
-2. 中文PDF → PyMuPDF（快速）
-3. 英文PDF → Nougat → 失败降级PyMuPDF
+策略：
+- 所有 PDF 统一使用 PyMuPDF 处理（快速、稳定）
+- RAG 引擎推荐使用 pymupdf4llm（见 pdf_markdown_processor.py）
+
+注意：Nougat 已废弃，不再使用
 """

 from typing import Dict, Any, Optional
 from loguru import logger

 from .language_detector import detect_language
-from .nougat_extractor import extract_pdf_nougat, check_nougat_available
 from .pdf_extractor import extract_pdf_pymupdf


@@ -20,22 +20,24 @@ def extract_pdf(
    force_method: Optional[str] = None
 ) -> Dict[str, Any]:
    """
-    PDF提取主函数（顺序降级策略）
+    PDF提取主函数
    
    处理流程：
-    1. 检测语言
-    2. 中文 → 直接PyMuPDF
-    3. 英文 → 尝试Nougat → 失败降级PyMuPDF
+    1. 检测语言（仅用于元数据）
+    2. 使用 PyMuPDF 提取文本
+    
+    注意：对于 RAG 引擎，推荐使用 /api/document/to-markdown 接口，
+    它使用 pymupdf4llm 提供更好的表格和结构支持。
    
    Args:
        file_path: PDF文件路径
-        force_method: 强制使用的方法 ('nougat' | 'pymupdf')
+        force_method: 保留参数（已废弃，仅支持 'pymupdf'）
    
    Returns:
        {
            "success": True,
-            "method": "nougat" | "pymupdf",
-            "reason": "chinese_pdf" | "english_pdf" | "nougat_failed" | "nougat_low_quality",
+            "method": "pymupdf",
+            "reason": "...",
            "text": "提取的文本",
            "metadata": {...}
        }
@@ -43,97 +45,31 @@ def extract_pdf(
    try:
        logger.info(f"开始处理PDF: {file_path}")
        
-        # Step 1: 语言检测
+        # Step 1: 语言检测（仅用于元数据）
        logger.info("[Step 1] 检测PDF语言...")
        language = detect_language(file_path)
        logger.info(f"检测结果: {language}")
        
-        # 如果强制指定方法
-        if force_method:
-            logger.info(f"强制使用方法: {force_method}")
-            
-            if force_method == 'nougat':
-                return extract_pdf_nougat(file_path)
-            elif force_method == 'pymupdf':
-                result = extract_pdf_pymupdf(file_path)
-                result['reason'] = 'force_pymupdf'
-                return result
-        
-        # Step 2: 中文PDF → 直接PyMuPDF
-        if language == 'chinese':
-            logger.info("[Step 2] 中文PDF，使用PyMuPDF快速处理")
-            
-            result = extract_pdf_pymupdf(file_path)
-            
-            if result['success']:
-                result['reason'] = 'chinese_pdf'
-                result['detected_language'] = language
-                logger.info("✅ PyMuPDF处理成功（中文PDF）")
-                return result
-            else:
-                logger.error("❌ PyMuPDF处理失败")
-                return result
-        
-        # Step 3: 英文PDF → 尝试Nougat
-        logger.info("[Step 3] 英文PDF，尝试Nougat高质量解析")
-        
-        # 检查Nougat是否可用
-        if not check_nougat_available():
-            logger.warning("⚠️ Nougat不可用，降级到PyMuPDF")
-            
-            result = extract_pdf_pymupdf(file_path)
-            if result['success']:
-                result['reason'] = 'nougat_unavailable'
-                result['detected_language'] = language
-            return result
-        
-        # 尝试Nougat
-        try:
-            nougat_result = extract_pdf_nougat(file_path)
-            
-            if not nougat_result['success']:
-                logger.warning("⚠️ Nougat提取失败，降级到PyMuPDF")
-                raise Exception(nougat_result.get('error', 'Nougat failed'))
-            
-            # 质量检查
-            quality_score = nougat_result['metadata'].get('quality_score', 0)
-            
-            logger.info(f"Nougat质量评分: {quality_score:.2f}")
-            
-            # 质量阈值：0.7
-            if quality_score >= 0.7:
-                logger.info("✅ Nougat处理成功（质量合格）")
-                nougat_result['reason'] = 'english_pdf_high_quality'
-                nougat_result['detected_language'] = language
-                return nougat_result
-            else:
-                logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f}，降级到PyMuPDF")
-                raise Exception(f"Quality too low: {quality_score}")
-        
-        except Exception as e:
-            logger.warning(f"Nougat处理失败: {str(e)}，降级到PyMuPDF")
-        
-        # Step 4: 降级到PyMuPDF
-        logger.info("[Step 4] 降级使用PyMuPDF")
+        # Step 2: 使用 PyMuPDF 提取
+        logger.info("[Step 2] 使用PyMuPDF处理")
        
        result = extract_pdf_pymupdf(file_path)
        
        if result['success']:
-            result['reason'] = 'nougat_failed_or_low_quality'
+            result['reason'] = 'pymupdf_standard'
            result['detected_language'] = language
-            result['fallback'] = True
-            logger.info("✅ PyMuPDF处理成功（降级方案）")
+            logger.info("✅ PyMuPDF处理成功")
        else:
-            logger.error("❌ PyMuPDF处理也失败了")
+            logger.error("❌ PyMuPDF处理失败")
        
        return result
    
    except Exception as e:
-        logger.error(f"PDF处理完全失败: {str(e)}")
+        logger.error(f"PDF处理失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
-            "method": "unknown"
+            "method": "pymupdf"
        }


@@ -149,34 +85,20 @@ def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
    Returns:
        {
            "detected_language": "chinese" | "english",
-            "recommended_method": "nougat" | "pymupdf",
+            "recommended_method": "pymupdf",
            "reason": "...",
-            "nougat_available": True | False
+            "nougat_available": False  # 已废弃
        }
    """
    try:
        # 检测语言
        language = detect_language(file_path)
        
-        # 检查Nougat可用性
-        nougat_available = check_nougat_available()
-        
-        # 决定策略
-        if language == 'chinese':
-            recommended_method = 'pymupdf'
-            reason = '中文PDF，推荐使用PyMuPDF快速处理'
-        elif nougat_available:
-            recommended_method = 'nougat'
-            reason = '英文PDF，推荐使用Nougat高质量解析'
-        else:
-            recommended_method = 'pymupdf'
-            reason = 'Nougat不可用，使用PyMuPDF'
-        
        return {
            "detected_language": language,
-            "recommended_method": recommended_method,
-            "reason": reason,
-            "nougat_available": nougat_available
+            "recommended_method": "pymupdf",
+            "reason": "统一使用 PyMuPDF 处理（RAG 引擎推荐使用 /api/document/to-markdown）",
+            "nougat_available": False  # 已废弃
        }
    
    except Exception as e:
--- a/extraction_service/test_dc_api.py
+++ b/extraction_service/test_dc_api.py
@@ -334,6 +334,9 @@ if __name__ == "__main__":



+
+
+



--- a/extraction_service/test_execute_simple.py
+++ b/extraction_service/test_execute_simple.py
@@ -100,6 +100,9 @@ except Exception as e:



+
+
+



--- a/extraction_service/test_module.py
+++ b/extraction_service/test_module.py
@@ -80,6 +80,9 @@ except Exception as e:



+
+
+
				`@@ -200,6 +200,9 @@ def get_missing_summary(df: pd.DataFrame) -> dict:`