192 lines
5.0 KiB
Python
192 lines
5.0 KiB
Python
"""
|
||
PDF文本提取服务
|
||
|
||
使用PyMuPDF (fitz)提取PDF文本内容
|
||
"""
|
||
|
||
import fitz # PyMuPDF
|
||
from typing import Dict, Any
|
||
from loguru import logger
|
||
|
||
|
||
def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
    """
    Extract plain text from a PDF using PyMuPDF.

    Pages are processed one at a time; a page whose extraction raises is
    logged and skipped so that a single bad page does not abort the whole
    document. Every page with non-blank text is preceded in the output by a
    separator of the form ``--- 第 N 页 ---``.

    Args:
        file_path: Path of the PDF file.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf",
                "text": "<extracted text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": 20,
                    "char_count": 50000,
                    "has_text": True
                }
            }

        ``char_count`` is the length of ``text`` (page separators included).
        ``has_text`` is True only when more than 100 characters of real page
        text were extracted; the injected separators do not count.

        On failure::

            {"success": False, "error": "<message>", "method": "pymupdf"}
    """
    # Minimum amount of real page text for the PDF to count as "has text".
    min_text_chars = 100

    try:
        logger.info(f"开始使用PyMuPDF提取: {file_path}")

        text_parts = []
        extracted_chars = 0

        # The context manager closes the document even if something below
        # raises, so the file handle is never leaked.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            logger.info(f"PDF页数: {page_count}")

            # Index-based loop on purpose: loading the page happens inside
            # the per-page try, so a page that fails to load is skipped too.
            for page_num in range(page_count):
                try:
                    page = doc[page_num]
                    text = page.get_text()

                    if text.strip():
                        # Page separator, then the page's text.
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(text)
                        extracted_chars += len(text)

                    logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符")

                except Exception as e:
                    # Best effort: skip the broken page and keep going.
                    logger.warning(f"第 {page_num + 1} 页提取失败: {e}")

        full_text = "".join(text_parts)
        char_count = len(full_text)

        # Judge "has text" on the extracted page text only. Counting the
        # separators would let a multi-page scanned PDF with a few stray
        # characters per page pass the threshold.
        has_text = extracted_chars > min_text_chars

        if not has_text:
            logger.warning("PDF可能是扫描版或无文本内容")

        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")

        return {
            "success": True,
            "method": "pymupdf",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": char_count,
                "has_text": has_text
            }
        }

    except Exception as e:
        logger.error(f"PyMuPDF提取失败: {e}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf"
        }
|
||
|
||
|
||
def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
    """
    Extract PDF text with PyMuPDF by walking the "dict" block structure.

    For each page the text blocks (``type == 0``) are traversed
    block -> line -> span, and every non-blank span text is collected.
    A page that raises while being processed is logged and skipped.

    NOTE: the spans of a page are joined with single spaces, so the original
    line breaks are not kept in the returned text.

    Args:
        file_path: Path of the PDF file.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf_layout",
                "text": "<extracted text>",
                "format": "plain_text",
                "metadata": {"page_count": ..., "char_count": ...}
            }

        On failure::

            {"success": False, "error": "<message>", "method": "pymupdf_layout"}
    """
    try:
        logger.info(f"开始使用PyMuPDF提取(保留布局): {file_path}")

        text_parts = []

        # The context manager closes the document even if something below
        # raises, so the file handle is never leaked.
        with fitz.open(file_path) as doc:
            page_count = len(doc)

            # Index-based loop on purpose: loading the page happens inside
            # the per-page try, so a page that fails to load is skipped too.
            for page_num in range(page_count):
                try:
                    page = doc[page_num]

                    # "dict" mode exposes the block/line/span hierarchy.
                    blocks = page.get_text("dict")["blocks"]

                    page_text = []
                    for block in blocks:
                        if block["type"] != 0:  # only text blocks carry spans
                            continue
                        for line in block.get("lines", []):
                            for span in line.get("spans", []):
                                text = span.get("text", "")
                                if text.strip():
                                    page_text.append(text)

                    if page_text:
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(" ".join(page_text))

                except Exception as e:
                    # Best effort: skip the broken page and keep going.
                    logger.warning(f"第 {page_num + 1} 页处理失败: {e}")

        full_text = "".join(text_parts)

        return {
            "success": True,
            "method": "pymupdf_layout",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": len(full_text)
            }
        }

    except Exception as e:
        logger.error(f"PyMuPDF布局提取失败: {e}")
        # "method" is included so the failure shape matches the success
        # shape and the plain-text extractor's error result.
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf_layout"
        }
|
||
|
||
|
||
def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """
    Read basic information about a PDF file.

    Args:
        file_path: Path of the PDF file.

    Returns:
        A dict with the keys ``page_count``, ``metadata`` (the document's
        ``metadata`` attribute), ``is_encrypted`` and ``is_pdf``. If the
        file cannot be opened or read, the error is logged and an empty
        dict is returned.
    """
    try:
        # The context manager closes the document even if reading one of
        # the properties raises, so the file handle is never leaked.
        with fitz.open(file_path) as doc:
            return {
                "page_count": len(doc),
                "metadata": doc.metadata,
                "is_encrypted": doc.is_encrypted,
                "is_pdf": doc.is_pdf
            }

    except Exception as e:
        logger.error(f"获取PDF元数据失败: {e}")
        return {}
|
||
|
||
|
||
|
||
|
||
|
||
|