Files
AIclinicalresearch/extraction_service/services/pdf_extractor.py

192 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PDF文本提取服务
使用PyMuPDF (fitz)提取PDF文本内容
"""
import fitz # PyMuPDF
from typing import Dict, Any
from loguru import logger
def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
    """
    Extract the plain text of a PDF with PyMuPDF.

    Pages are read one by one with ``page.get_text()``. Every page that
    yields non-whitespace text is preceded by a separator header
    (``--- 第 N 页 ---``) in the combined output. A page that raises during
    extraction is logged and skipped; it does not abort the whole run.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf",
                "text": "<combined text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": <number of pages>,
                    "char_count": <len of combined text>,
                    "has_text": <char_count > 100>
                }
            }

        On failure (any exception while opening/reading/closing)::

            {"success": False, "error": "<message>", "method": "pymupdf"}

        This function does not raise; errors are reported in the result.
    """
    try:
        logger.info(f"开始使用PyMuPDF提取: {file_path}")

        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            logger.info(f"PDF页数: {page_count}")

            text_parts = []
            for page_num in range(page_count):
                try:
                    text = doc[page_num].get_text()
                    if text.strip():
                        # Page separator header, then the page's raw text.
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(text)
                        logger.debug(f"{page_num + 1} 页提取了 {len(text)} 个字符")
                except Exception as e:
                    # Best effort: one unreadable page must not fail the file.
                    logger.warning(f"{page_num + 1} 页提取失败: {str(e)}")
                    continue
        finally:
            # Always release the document, even if reading failed part-way.
            doc.close()

        full_text = "".join(text_parts)
        # NOTE: char_count (and therefore has_text) includes the separator
        # headers added above, not only the text taken from the pages.
        char_count = len(full_text)

        # Fewer than ~100 characters suggests a scanned/image-only PDF.
        has_text = char_count > 100
        if not has_text:
            logger.warning("PDF可能是扫描版或无文本内容")

        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")

        return {
            "success": True,
            "method": "pymupdf",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": char_count,
                "has_text": has_text
            }
        }
    except Exception as e:
        logger.error(f"PyMuPDF提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf"
        }
def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
    """
    Extract PDF text with PyMuPDF by walking the "dict" block structure.

    For each page, ``page.get_text("dict")`` is read and the text of every
    non-blank span inside text blocks (``type == 0``) is collected. The spans
    of one page are joined with single spaces and preceded by a separator
    header (``--- 第 N 页 ---``). A page that raises during processing is
    logged and skipped.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf_layout",
                "text": "<combined text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": <number of pages>,
                    "char_count": <len of combined text>
                }
            }

        On failure::

            {"success": False, "error": "<message>", "method": "pymupdf_layout"}

        This function does not raise; errors are reported in the result.
    """
    try:
        logger.info(f"开始使用PyMuPDF提取保留布局: {file_path}")

        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            text_parts = []

            for page_num in range(page_count):
                try:
                    blocks = doc[page_num].get_text("dict")["blocks"]

                    page_text = []
                    for block in blocks:
                        if block["type"] != 0:
                            # Only text blocks (type 0) are of interest.
                            continue
                        for line in block.get("lines", []):
                            for span in line.get("spans", []):
                                text = span.get("text", "")
                                if text.strip():
                                    page_text.append(text)

                    if page_text:
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(" ".join(page_text))
                except Exception as e:
                    # Best effort: one unreadable page must not fail the file.
                    logger.warning(f"{page_num + 1} 页处理失败: {str(e)}")
                    continue
        finally:
            # Always release the document, even if reading failed part-way.
            doc.close()

        full_text = "".join(text_parts)

        return {
            "success": True,
            "method": "pymupdf_layout",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": len(full_text)
            }
        }
    except Exception as e:
        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
        # "method" is included so the failure shape matches the plain-text
        # extractor's failure result in this module.
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf_layout"
        }
def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """
    Read basic metadata of a PDF file with PyMuPDF.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys ``page_count`` (``len(doc)``), ``metadata``
        (``doc.metadata``), ``is_encrypted`` and ``is_pdf``. If the file
        cannot be opened or read, the error is logged and an empty dict is
        returned; this function does not raise.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "page_count": len(doc),
                "metadata": doc.metadata,
                "is_encrypted": doc.is_encrypted,
                "is_pdf": doc.is_pdf
            }
        finally:
            # Release the document even if reading a property raised.
            doc.close()
    except Exception as e:
        logger.error(f"获取PDF元数据失败: {str(e)}")
        return {}