feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv
This commit is contained in:
192
extraction_service/services/pdf_processor.py
Normal file
192
extraction_service/services/pdf_processor.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
PDF处理主服务
|
||||
|
||||
实现顺序降级策略:
|
||||
1. 检测语言
|
||||
2. 中文PDF → PyMuPDF(快速)
|
||||
3. 英文PDF → Nougat → 失败降级PyMuPDF
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, Optional
|
||||
from loguru import logger
|
||||
|
||||
from .language_detector import detect_language
|
||||
from .nougat_extractor import extract_pdf_nougat, check_nougat_available
|
||||
from .pdf_extractor import extract_pdf_pymupdf
|
||||
|
||||
|
||||
def extract_pdf(
    file_path: str,
    force_method: Optional[str] = None,
    quality_threshold: float = 0.7,
) -> Dict[str, Any]:
    """Extract text from a PDF using a sequential fallback strategy.

    Flow:
        1. If ``force_method`` names a known extractor, run it directly.
           Language detection is skipped in that case, so a detection
           failure cannot break a run that never needed the language.
        2. Otherwise detect the document language.
        3. ``'chinese'`` -> PyMuPDF.
        4. Anything else -> Nougat, when it is available, succeeds and its
           ``metadata['quality_score']`` reaches ``quality_threshold``;
           otherwise fall back to PyMuPDF.

    Args:
        file_path: Path to the PDF file.
        force_method: ``'nougat'`` or ``'pymupdf'`` to bypass the automatic
            strategy. Any other non-empty value is logged as unknown and the
            automatic strategy is used.
        quality_threshold: Minimum Nougat quality score accepted before
            falling back to PyMuPDF. Defaults to 0.7.

    Returns:
        The dict produced by the extractor that ran, augmented as follows:

        * ``reason``: one of ``'force_nougat'``, ``'force_pymupdf'``,
          ``'chinese_pdf'``, ``'nougat_unavailable'``,
          ``'english_pdf_high_quality'``, ``'nougat_failed_or_low_quality'``.
          On the automatic paths it is only set when extraction succeeded.
        * ``detected_language``: set on successful automatic runs.
        * ``fallback``: ``True`` when PyMuPDF succeeded after Nougat was
          tried and rejected.

        If an unexpected exception escapes, returns
        ``{"success": False, "error": <message>, "method": "unknown"}``.
    """
    try:
        logger.info(f"开始处理PDF: {file_path}")

        # A forced method is handled before language detection: the forced
        # paths never use the detected language.
        if force_method:
            logger.info(f"强制使用方法: {force_method}")

            if force_method == 'nougat':
                result = extract_pdf_nougat(file_path)
                # setdefault keeps any reason the extractor already supplied.
                result.setdefault('reason', 'force_nougat')
                return result

            if force_method == 'pymupdf':
                result = extract_pdf_pymupdf(file_path)
                result['reason'] = 'force_pymupdf'
                return result

            # Keep the historical fall-through, but make it visible.
            logger.warning(f"⚠️ 未知的强制方法: {force_method},使用自动策略")

        # Step 1: language detection
        logger.info("[Step 1] 检测PDF语言...")
        language = detect_language(file_path)
        logger.info(f"检测结果: {language}")

        # Step 2: Chinese PDF -> PyMuPDF directly
        if language == 'chinese':
            logger.info("[Step 2] 中文PDF,使用PyMuPDF快速处理")

            result = extract_pdf_pymupdf(file_path)
            if result['success']:
                result['reason'] = 'chinese_pdf'
                result['detected_language'] = language
                logger.info("✅ PyMuPDF处理成功(中文PDF)")
            else:
                logger.error("❌ PyMuPDF处理失败")
            return result

        # Step 3: non-Chinese PDF -> try Nougat
        logger.info("[Step 3] 英文PDF,尝试Nougat高质量解析")

        if not check_nougat_available():
            logger.warning("⚠️ Nougat不可用,降级到PyMuPDF")

            result = extract_pdf_pymupdf(file_path)
            if result['success']:
                result['reason'] = 'nougat_unavailable'
                result['detected_language'] = language
            else:
                logger.error("❌ PyMuPDF处理失败")
            return result

        nougat_result = _nougat_if_good_enough(file_path, quality_threshold)
        if nougat_result is not None:
            logger.info("✅ Nougat处理成功(质量合格)")
            nougat_result['reason'] = 'english_pdf_high_quality'
            nougat_result['detected_language'] = language
            return nougat_result

        # Step 4: Nougat failed or scored too low -> PyMuPDF
        logger.info("[Step 4] 降级使用PyMuPDF")

        result = extract_pdf_pymupdf(file_path)
        if result['success']:
            result['reason'] = 'nougat_failed_or_low_quality'
            result['detected_language'] = language
            result['fallback'] = True
            logger.info("✅ PyMuPDF处理成功(降级方案)")
        else:
            logger.error("❌ PyMuPDF处理也失败了")

        return result

    except Exception as e:
        # Top-level boundary: callers get a failure dict instead of an
        # exception. logger.exception records the traceback as well.
        logger.exception(f"PDF处理完全失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "unknown"
        }


def _nougat_if_good_enough(
    file_path: str,
    quality_threshold: float,
) -> Optional[Dict[str, Any]]:
    """Return Nougat's result if it succeeded with an acceptable score, else None.

    Any exception raised while running Nougat or reading its result
    (missing keys, non-numeric score) is logged and treated as a failure so
    the caller can fall back to PyMuPDF.
    """
    try:
        nougat_result = extract_pdf_nougat(file_path)

        if not nougat_result['success']:
            error = nougat_result.get('error', 'Nougat failed')
            logger.warning(f"⚠️ Nougat提取失败: {error},降级到PyMuPDF")
            return None

        quality_score = nougat_result['metadata'].get('quality_score', 0)
        logger.info(f"Nougat质量评分: {quality_score:.2f}")

        # Written as ">=" so a NaN score is rejected rather than accepted.
        if quality_score >= quality_threshold:
            return nougat_result

        logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f},降级到PyMuPDF")
        return None

    except Exception as e:
        logger.warning(f"Nougat处理失败: {str(e)},降级到PyMuPDF")
        return None
|
||||
|
||||
|
||||
def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
    """Preview which extraction method would be chosen, without extracting.

    Detects the document language, checks whether Nougat is available, and
    derives the recommendation from those two facts.

    Args:
        file_path: Path to the PDF file.

    Returns:
        On success, a dict with the keys ``detected_language``,
        ``recommended_method`` (``'nougat'`` or ``'pymupdf'``), ``reason``
        (human-readable explanation) and ``nougat_available``.
        If any step raises, the error is logged and ``{"error": <message>}``
        is returned instead.
    """
    try:
        lang = detect_language(file_path)
        has_nougat = check_nougat_available()

        # Chinese always goes to PyMuPDF; everything else prefers Nougat
        # when it can be used.
        if lang == 'chinese':
            choice = ('pymupdf', '中文PDF,推荐使用PyMuPDF快速处理')
        elif has_nougat:
            choice = ('nougat', '英文PDF,推荐使用Nougat高质量解析')
        else:
            choice = ('pymupdf', 'Nougat不可用,使用PyMuPDF')

        method, explanation = choice
        strategy: Dict[str, Any] = {}
        strategy["detected_language"] = lang
        strategy["recommended_method"] = method
        strategy["reason"] = explanation
        strategy["nougat_available"] = has_nougat
        return strategy

    except Exception as exc:
        logger.error(f"获取处理策略失败: {str(exc)}")
        return {"error": str(exc)}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user