feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv
This commit is contained in:
191
extraction_service/services/pdf_extractor.py
Normal file
191
extraction_service/services/pdf_extractor.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
PDF文本提取服务
|
||||
|
||||
使用PyMuPDF (fitz)提取PDF文本内容
|
||||
"""
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
    """
    Extract the plain text of a PDF with PyMuPDF.

    Every page whose text is not blank is appended to the output behind a
    ``--- 第 N 页 ---`` separator (N is the 1-based page number). A page
    that fails to extract is logged as a warning and skipped, so one bad
    page does not abort the whole document.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf",
                "text": "<extracted text, including page separators>",
                "format": "plain_text",
                "metadata": {
                    "page_count": 20,
                    "char_count": 50000,   # len() of "text"
                    "has_text": True,      # > 100 chars of real page text
                },
            }

        On failure (any ``Exception`` raised while opening or reading)::

            {"success": False, "error": "<message>", "method": "pymupdf"}
    """
    # Minimum amount of real page text for the PDF to count as "has text";
    # below this the file is probably a scan or has no text layer.
    min_text_chars = 100

    try:
        logger.info(f"开始使用PyMuPDF提取: {file_path}")

        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            logger.info(f"PDF页数: {page_count}")

            text_parts = []
            # Characters that came from the pages themselves. Tracked
            # separately so the separators added below cannot push a
            # near-empty (scanned) PDF over the has_text threshold.
            extracted_chars = 0

            for page_num in range(page_count):
                try:
                    text = doc[page_num].get_text()

                    if text.strip():
                        # Page separator, then the page's own text.
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(text)
                        extracted_chars += len(text.strip())

                    logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符")

                except Exception as e:
                    # Best effort: skip the broken page, keep the rest.
                    logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}")
                    continue
        finally:
            # Release the document even when extraction raised.
            doc.close()

        full_text = "".join(text_parts)
        char_count = len(full_text)

        has_text = extracted_chars > min_text_chars

        if not has_text:
            logger.warning("PDF可能是扫描版或无文本内容")

        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")

        return {
            "success": True,
            "method": "pymupdf",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": char_count,
                "has_text": has_text
            }
        }

    except Exception as e:
        logger.error(f"PyMuPDF提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf"
        }
|
||||
|
||||
|
||||
def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
    """
    Extract PDF text with PyMuPDF by walking the "dict" text structure.

    For each page the blocks returned by ``page.get_text("dict")`` are
    scanned; only blocks with ``type == 0`` (text blocks) are used, and the
    non-blank text of every span in every line is collected. The spans of
    one page are joined with single spaces, so line breaks inside a page
    are not kept. Pages that yield text are emitted behind a
    ``--- 第 N 页 ---`` separator (N is the 1-based page number). A page
    that fails is logged as a warning and skipped.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf_layout",
                "text": "<extracted text, including page separators>",
                "format": "plain_text",
                "metadata": {"page_count": ..., "char_count": ...},
            }

        On failure (any ``Exception`` raised while opening or reading)::

            {"success": False, "error": "<message>", "method": "pymupdf_layout"}
    """
    try:
        logger.info(f"开始使用PyMuPDF提取(保留布局): {file_path}")

        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            text_parts = []

            for page_num in range(page_count):
                try:
                    page = doc[page_num]

                    # "dict" mode exposes the block -> line -> span hierarchy.
                    blocks = page.get_text("dict")["blocks"]

                    page_text = []
                    for block in blocks:
                        if block["type"] != 0:  # skip non-text blocks
                            continue
                        for line in block.get("lines", []):
                            for span in line.get("spans", []):
                                text = span.get("text", "")
                                if text.strip():
                                    page_text.append(text)

                    if page_text:
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(" ".join(page_text))

                except Exception as e:
                    # Best effort: skip the broken page, keep the rest.
                    logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}")
                    continue
        finally:
            # Release the document even when extraction raised.
            doc.close()

        full_text = "".join(text_parts)

        return {
            "success": True,
            "method": "pymupdf_layout",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": len(full_text)
            }
        }

    except Exception as e:
        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            # Same failure shape as the plain extractor, so callers can
            # always tell which method produced the error.
            "method": "pymupdf_layout"
        }
|
||||
|
||||
|
||||
def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """
    Read basic document-level information from a PDF.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys ``page_count`` (``len(doc)``), ``metadata``
        (``doc.metadata``), ``is_encrypted`` and ``is_pdf``. If opening or
        reading the document raises, the error is logged and an empty dict
        is returned instead.
    """
    try:
        doc = fitz.open(file_path)
        try:
            return {
                "page_count": len(doc),
                "metadata": doc.metadata,
                "is_encrypted": doc.is_encrypted,
                "is_pdf": doc.is_pdf
            }
        finally:
            # Release the document even when reading an attribute raised.
            doc.close()

    except Exception as e:
        logger.error(f"获取PDF元数据失败: {str(e)}")
        return {}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user