feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions
--- a/extraction_service/services/docx_extractor.py
+++ b/extraction_service/services/docx_extractor.py
@@ -0,0 +1,257 @@
+"""
+Docx文档提取服务
+
+使用Mammoth库提取Word文档文本
+支持.docx格式（不支持老版.doc）
+"""
+
+import mammoth
+from pathlib import Path
+from typing import Dict, Any
+from loguru import logger
+
+
+def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
+    """
+    Extract the plain text of a .docx file using Mammoth.
+
+    The path must exist and carry a ``.docx`` suffix (case-insensitive).
+    Failures, including any exception raised while reading or converting,
+    are reported through the returned dict instead of being raised.
+    Text that is empty or whitespace-only is treated as a failure.
+
+    Args:
+        file_path: Path of the .docx file to read.
+
+    Returns:
+        On success::
+
+            {"success": True, "text": str, "format": "plain_text",
+             "metadata": {"char_count": int, "has_tables": bool,
+                          "file_size": int, "warnings": int}}
+
+        On failure ``success`` is False, ``text`` is "" and ``error``
+        holds the reason. ``metadata`` is {} except for the empty-text
+        case, where it carries ``char_count`` (0) and ``file_size``.
+        ``file_size`` is in bytes; ``warnings`` counts Mammoth messages.
+    """
+    try:
+        file_path_obj = Path(file_path)
+
+        # Reject a missing path up front.
+        if not file_path_obj.exists():
+            return {
+                "success": False,
+                "error": f"文件不存在: {file_path}",
+                "text": "",
+                "metadata": {}
+            }
+
+        # Only the .docx suffix is accepted (compared case-insensitively).
+        if file_path_obj.suffix.lower() != '.docx':
+            return {
+                "success": False,
+                "error": f"不支持的文件格式: {file_path_obj.suffix}，仅支持.docx",
+                "text": "",
+                "metadata": {}
+            }
+
+        logger.info(f"开始提取Docx文件: {file_path_obj.name}")
+
+        # extract_raw_text returns text only; document formatting is dropped.
+        with open(file_path, "rb") as docx_file:
+            result = mammoth.extract_raw_text(docx_file)
+            text = result.value  # extracted text
+            messages = result.messages  # conversion warnings/errors
+
+        # Messages are logged only; they never fail the extraction.
+        if messages:
+            logger.warning(f"Mammoth提取警告: {len(messages)}个")
+            for msg in messages:
+                logger.debug(f"  - {msg.type}: {msg.message}")
+
+        # Mammoth documents that every paragraph is followed by two newlines,
+        # so a blank document can yield whitespace only; strip before testing.
+        char_count = len(text)
+        if not text.strip():
+            logger.warning("提取的文本为空")
+            return {
+                "success": False,
+                "error": "文档内容为空或无法提取",
+                "text": "",
+                "metadata": {
+                    "char_count": 0,
+                    "file_size": file_path_obj.stat().st_size
+                }
+            }
+
+        # Rough heuristic: a tab or pipe anywhere in the text counts as a table.
+        has_tables = '\t' in text or '|' in text
+        logger.info(f"Docx提取成功: {char_count}个字符")
+
+        return {
+            "success": True,
+            "text": text,
+            "format": "plain_text",
+            "metadata": {
+                "char_count": char_count,
+                "has_tables": has_tables,
+                "file_size": file_path_obj.stat().st_size,
+                "warnings": len(messages)
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"Docx提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "text": "",
+            "metadata": {}
+        }
+
+
+def extract_docx_html(file_path: str) -> Dict[str, Any]:
+    """
+    Convert a .docx file to HTML with Mammoth.
+
+    Only the existence of the path is checked before conversion. Any
+    exception is logged and reported in the result instead of being raised.
+
+    Args:
+        file_path: Path of the document to convert.
+
+    Returns:
+        ``{"success": True, "html": str, "format": "html", "metadata": {...}}``
+        with ``html_length``, ``file_size`` and ``warnings`` in ``metadata``;
+        on failure ``success`` is False, ``html`` is "" and ``error`` is set.
+    """
+    def _failed(reason: str) -> Dict[str, Any]:
+        # Every failure result of this function has this shape.
+        return {
+            "success": False,
+            "error": reason,
+            "html": "",
+            "metadata": {},
+        }
+
+    try:
+        docx_path = Path(file_path)
+        if not docx_path.exists():
+            return _failed(f"文件不存在: {file_path}")
+
+        logger.info(f"开始提取Docx为HTML: {docx_path.name}")
+
+        # Mammoth is given the open binary stream rather than the path.
+        with open(file_path, "rb") as stream:
+            conversion = mammoth.convert_to_html(stream)
+            markup = conversion.value
+            notes = conversion.messages
+
+        # Conversion messages are only counted, never treated as errors.
+        if notes:
+            logger.warning(f"HTML转换警告: {len(notes)}个")
+
+        logger.info(f"HTML提取成功: {len(markup)}个字符")
+
+        metadata = {
+            "html_length": len(markup),
+            "file_size": docx_path.stat().st_size,
+            "warnings": len(notes),
+        }
+        return {
+            "success": True,
+            "html": markup,
+            "format": "html",
+            "metadata": metadata,
+        }
+    except Exception as exc:
+        reason = str(exc)
+        logger.error(f"HTML提取失败: {reason}")
+        return _failed(reason)
+
+
+def validate_docx_file(file_path: str) -> Dict[str, Any]:
+    """
+    Run cheap sanity checks on a candidate .docx file.
+
+    The checks run in this order and stop at the first failure:
+
+    1. the path exists;
+    2. the suffix is ``.docx`` (compared case-insensitively);
+    3. the size does not exceed 50MB;
+    4. the file is not empty;
+    5. the first four bytes are the ZIP signature (a docx is a ZIP file).
+
+    The document itself is not parsed. Nothing is raised: an unexpected
+    error is reported as an invalid result.
+
+    Args:
+        file_path: Path of the file to check.
+
+    Returns:
+        ``{"valid": False, "reason": str}`` for a rejected file, or
+        ``{"valid": True, "reason": str, "file_info": {...}}`` where
+        ``file_info`` has ``filename``, ``size`` (bytes) and ``size_mb``
+        (``size`` divided by 1024 twice, rounded to two decimals).
+    """
+    # Largest accepted file size.
+    size_limit = 50 * 1024 * 1024  # 50MB
+
+    def _reject(reason: str) -> Dict[str, Any]:
+        # Rejections carry only the verdict and its reason.
+        return {
+            "valid": False,
+            "reason": reason,
+        }
+
+    try:
+        candidate = Path(file_path)
+
+        # 1. Existence.
+        if not candidate.exists():
+            return _reject("文件不存在")
+
+        # 2. Suffix.
+        extension = candidate.suffix
+        if extension.lower() != '.docx':
+            return _reject(f"不支持的格式: {extension}（仅支持.docx）")
+
+        # 3./4. Size: bounded above by the limit and strictly positive.
+        size_bytes = candidate.stat().st_size
+        size_in_mb = size_bytes / 1024 / 1024
+        if size_bytes > size_limit:
+            return _reject(f"文件过大: {size_in_mb:.2f}MB（限制50MB）")
+
+        if size_bytes == 0:
+            return _reject("文件为空")
+
+        # 5. Signature: read errors are reported separately from a mismatch.
+        try:
+            with open(file_path, "rb") as handle:
+                magic = handle.read(4)
+        except Exception as read_error:
+            return _reject(f"无法读取文件: {str(read_error)}")
+
+        if magic != b'PK\x03\x04':
+            return _reject("不是有效的Docx文件（ZIP签名错误）")
+
+        # Every check passed.
+        return {
+            "valid": True,
+            "reason": "文件有效",
+            "file_info": {
+                "filename": candidate.name,
+                "size": size_bytes,
+                "size_mb": round(size_in_mb, 2),
+            },
+        }
+
+    except Exception as unexpected:
+        return _reject(f"验证失败: {str(unexpected)}")
+
+
+
+
+
+