feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

This commit is contained in:
AI Clinical Dev Team
2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions

View File

@@ -0,0 +1,191 @@
"""
PDF文本提取服务
使用PyMuPDF (fitz)提取PDF文本内容
"""
import fitz # PyMuPDF
from typing import Dict, Any
from loguru import logger
def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
    """
    Extract plain text from a PDF using PyMuPDF.

    Pages are read one at a time. A page whose extraction raises is logged
    and skipped, so one bad page does not abort the whole document. Every
    page that yields non-whitespace text is preceded in the combined output
    by a separator line of the form ``--- 第 N 页 ---``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf",
                "text": "<combined text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": <number of pages in the document>,
                    "char_count": <len of combined text, separators included>,
                    "has_text": <True when char_count > 100>,
                },
            }

        On failure (any exception outside the per-page handling, e.g. the
        file cannot be opened)::

            {"success": False, "error": "<message>", "method": "pymupdf"}
    """
    try:
        logger.info(f"开始使用PyMuPDF提取: {file_path}")

        text_parts = []
        # The context manager closes the document even when something below
        # raises; a trailing doc.close() would be skipped on the error path.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            logger.info(f"PDF页数: {page_count}")

            for page_num in range(page_count):
                try:
                    text = doc[page_num].get_text()
                    if text.strip():
                        # Page separator so page boundaries survive in the
                        # flattened text.
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(text)
                    logger.debug(f"{page_num + 1} 页提取了 {len(text)} 个字符")
                except Exception as e:
                    # Best effort: skip the failing page, keep the rest.
                    logger.warning(f"{page_num + 1} 页提取失败: {e}")
                    continue

        full_text = "".join(text_parts)
        char_count = len(full_text)

        # Heuristic: very little text suggests a scanned (image-only) PDF.
        # Note that char_count includes the page separators added above.
        has_text = char_count > 100
        if not has_text:
            logger.warning("PDF可能是扫描版或无文本内容")

        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")
        return {
            "success": True,
            "method": "pymupdf",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": char_count,
                "has_text": has_text,
            },
        }
    except Exception as e:
        logger.error(f"PyMuPDF提取失败: {e}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf",
        }
def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
    """
    Extract PDF text with PyMuPDF by walking the page's block structure.

    Each page is read with ``page.get_text("dict")``; for every block of
    type 0 (text block) the non-blank span texts are collected in order.
    The spans of one page are joined with single spaces, so line breaks
    within a page are not kept in the output. Every page that yields text
    is preceded by a separator line of the form ``--- 第 N 页 ---``. A page
    whose processing raises is logged and skipped.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf_layout",
                "text": "<combined text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": <number of pages in the document>,
                    "char_count": <len of combined text, separators included>,
                },
            }

        On failure::

            {"success": False, "error": "<message>", "method": "pymupdf_layout"}
    """
    try:
        logger.info(f"开始使用PyMuPDF提取保留布局: {file_path}")

        text_parts = []
        # The context manager closes the document even when something below
        # raises; a trailing doc.close() would be skipped on the error path.
        with fitz.open(file_path) as doc:
            page_count = len(doc)

            for page_num in range(page_count):
                try:
                    # "dict" mode exposes blocks -> lines -> spans.
                    blocks = doc[page_num].get_text("dict")["blocks"]
                    page_text = [
                        span.get("text", "")
                        for block in blocks
                        if block["type"] == 0  # text block
                        for line in block.get("lines", [])
                        for span in line.get("spans", [])
                        if span.get("text", "").strip()
                    ]
                    if page_text:
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(" ".join(page_text))
                except Exception as e:
                    # Best effort: skip the failing page, keep the rest.
                    logger.warning(f"{page_num + 1} 页处理失败: {e}")
                    continue

        full_text = "".join(text_parts)
        return {
            "success": True,
            "method": "pymupdf_layout",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": len(full_text),
            },
        }
    except Exception as e:
        logger.error(f"PyMuPDF布局提取失败: {e}")
        # "method" is included so the failure shape matches
        # extract_pdf_pymupdf's failure result.
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf_layout",
        }
def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """
    Read basic document-level information from a PDF.

    Args:
        file_path: Path of the PDF file to inspect.

    Returns:
        A dict with the keys ``"page_count"`` (``len(doc)``), ``"metadata"``
        (``doc.metadata``), ``"is_encrypted"`` and ``"is_pdf"``. If anything
        raises (e.g. the file cannot be opened), the error is logged and an
        empty dict is returned.
    """
    try:
        # The context manager closes the document even if reading one of the
        # attributes below raises.
        with fitz.open(file_path) as doc:
            return {
                "page_count": len(doc),
                "metadata": doc.metadata,
                "is_encrypted": doc.is_encrypted,
                "is_pdf": doc.is_pdf,
            }
    except Exception as e:
        logger.error(f"获取PDF元数据失败: {e}")
        return {}