""" PDF文本提取服务 使用PyMuPDF (fitz)提取PDF文本内容 """ import fitz # PyMuPDF from typing import Dict, Any from loguru import logger def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]: """ 使用PyMuPDF提取PDF文本 Args: file_path: PDF文件路径 Returns: { "success": True, "method": "pymupdf", "text": "提取的文本", "metadata": { "page_count": 20, "char_count": 50000, "has_text": True } } """ try: logger.info(f"开始使用PyMuPDF提取: {file_path}") # 打开PDF doc = fitz.open(file_path) page_count = len(doc) logger.info(f"PDF页数: {page_count}") # 提取所有页面的文本 text_parts = [] for page_num in range(page_count): try: page = doc[page_num] text = page.get_text() if text.strip(): # 添加页面分隔符 text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n") text_parts.append(text) logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符") except Exception as e: logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}") continue # 合并文本 full_text = "".join(text_parts) char_count = len(full_text) # 关闭文档 doc.close() # 检查是否提取到文本 has_text = char_count > 100 # 至少要有100个字符 if not has_text: logger.warning(f"PDF可能是扫描版或无文本内容") logger.info(f"PyMuPDF提取完成: 字符数={char_count}") return { "success": True, "method": "pymupdf", "text": full_text, "format": "plain_text", "metadata": { "page_count": page_count, "char_count": char_count, "has_text": has_text } } except Exception as e: logger.error(f"PyMuPDF提取失败: {str(e)}") return { "success": False, "error": str(e), "method": "pymupdf" } def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]: """ 使用PyMuPDF提取PDF文本(保留布局) Args: file_path: PDF文件路径 Returns: 提取结果 """ try: logger.info(f"开始使用PyMuPDF提取(保留布局): {file_path}") doc = fitz.open(file_path) page_count = len(doc) text_parts = [] for page_num in range(page_count): try: page = doc[page_num] # 使用dict模式提取,可以保留更多格式信息 blocks = page.get_text("dict")["blocks"] page_text = [] for block in blocks: if block["type"] == 0: # 文本块 for line in block.get("lines", []): for span in line.get("spans", []): text = span.get("text", "") if text.strip(): page_text.append(text) if page_text: text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n") text_parts.append(" ".join(page_text)) except Exception as e: logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}") continue full_text = "".join(text_parts) doc.close() return { "success": True, "method": "pymupdf_layout", "text": full_text, "format": "plain_text", "metadata": { "page_count": page_count, "char_count": len(full_text) } } except Exception as e: logger.error(f"PyMuPDF布局提取失败: {str(e)}") return { "success": False, "error": str(e) } def get_pdf_metadata(file_path: str) -> Dict[str, Any]: """ 获取PDF元数据 Args: file_path: PDF文件路径 Returns: PDF元数据 """ try: doc = fitz.open(file_path) metadata = { "page_count": len(doc), "metadata": doc.metadata, "is_encrypted": doc.is_encrypted, "is_pdf": doc.is_pdf } doc.close() return metadata except Exception as e: logger.error(f"获取PDF元数据失败: {str(e)}") return {}