feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions
--- a/extraction_service/services/pdf_extractor.py
+++ b/extraction_service/services/pdf_extractor.py
@@ -0,0 +1,191 @@
+"""
+PDF文本提取服务
+
+使用PyMuPDF (fitz)提取PDF文本内容
+"""
+
+import fitz  # PyMuPDF
+from typing import Dict, Any
+from loguru import logger
+
+
+def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
+    """
+    使用PyMuPDF提取PDF文本
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        {
+            "success": True,
+            "method": "pymupdf",
+            "text": "提取的文本",
+            "metadata": {
+                "page_count": 20,
+                "char_count": 50000,
+                "has_text": True
+            }
+        }
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取: {file_path}")
+        
+        # 打开PDF
+        doc = fitz.open(file_path)
+        page_count = len(doc)
+        
+        logger.info(f"PDF页数: {page_count}")
+        
+        # 提取所有页面的文本
+        text_parts = []
+        
+        for page_num in range(page_count):
+            try:
+                page = doc[page_num]
+                text = page.get_text()
+                
+                if text.strip():
+                    # 添加页面分隔符
+                    text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                    text_parts.append(text)
+                    
+                    logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符")
+            
+            except Exception as e:
+                logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}")
+                continue
+        
+        # 合并文本
+        full_text = "".join(text_parts)
+        char_count = len(full_text)
+        
+        # 关闭文档
+        doc.close()
+        
+        # 检查是否提取到文本
+        has_text = char_count > 100  # 至少要有100个字符
+        
+        if not has_text:
+            logger.warning(f"PDF可能是扫描版或无文本内容")
+        
+        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")
+        
+        return {
+            "success": True,
+            "method": "pymupdf",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": char_count,
+                "has_text": has_text
+            }
+        }
+    
+    except Exception as e:
+        logger.error(f"PyMuPDF提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "pymupdf"
+        }
+
+
+def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
+    """
+    使用PyMuPDF提取PDF文本（保留布局）
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        提取结果
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取（保留布局）: {file_path}")
+        
+        doc = fitz.open(file_path)
+        page_count = len(doc)
+        
+        text_parts = []
+        
+        for page_num in range(page_count):
+            try:
+                page = doc[page_num]
+                
+                # 使用dict模式提取，可以保留更多格式信息
+                blocks = page.get_text("dict")["blocks"]
+                
+                page_text = []
+                
+                for block in blocks:
+                    if block["type"] == 0:  # 文本块
+                        for line in block.get("lines", []):
+                            for span in line.get("spans", []):
+                                text = span.get("text", "")
+                                if text.strip():
+                                    page_text.append(text)
+                
+                if page_text:
+                    text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                    text_parts.append(" ".join(page_text))
+            
+            except Exception as e:
+                logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}")
+                continue
+        
+        full_text = "".join(text_parts)
+        doc.close()
+        
+        return {
+            "success": True,
+            "method": "pymupdf_layout",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": len(full_text)
+            }
+        }
+    
+    except Exception as e:
+        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e)
+        }
+
+
+def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
+    """
+    获取PDF元数据
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        PDF元数据
+    """
+    try:
+        doc = fitz.open(file_path)
+        
+        metadata = {
+            "page_count": len(doc),
+            "metadata": doc.metadata,
+            "is_encrypted": doc.is_encrypted,
+            "is_pdf": doc.is_pdf
+        }
+        
+        doc.close()
+        return metadata
+    
+    except Exception as e:
+        logger.error(f"获取PDF元数据失败: {str(e)}")
+        return {}
+
+
+
+
+
+