""" PDF处理主服务 实现顺序降级策略: 1. 检测语言 2. 中文PDF → PyMuPDF(快速) 3. 英文PDF → Nougat → 失败降级PyMuPDF """ from typing import Dict, Any, Optional from loguru import logger from .language_detector import detect_language from .nougat_extractor import extract_pdf_nougat, check_nougat_available from .pdf_extractor import extract_pdf_pymupdf def extract_pdf( file_path: str, force_method: Optional[str] = None ) -> Dict[str, Any]: """ PDF提取主函数(顺序降级策略) 处理流程: 1. 检测语言 2. 中文 → 直接PyMuPDF 3. 英文 → 尝试Nougat → 失败降级PyMuPDF Args: file_path: PDF文件路径 force_method: 强制使用的方法 ('nougat' | 'pymupdf') Returns: { "success": True, "method": "nougat" | "pymupdf", "reason": "chinese_pdf" | "english_pdf" | "nougat_failed" | "nougat_low_quality", "text": "提取的文本", "metadata": {...} } """ try: logger.info(f"开始处理PDF: {file_path}") # Step 1: 语言检测 logger.info("[Step 1] 检测PDF语言...") language = detect_language(file_path) logger.info(f"检测结果: {language}") # 如果强制指定方法 if force_method: logger.info(f"强制使用方法: {force_method}") if force_method == 'nougat': return extract_pdf_nougat(file_path) elif force_method == 'pymupdf': result = extract_pdf_pymupdf(file_path) result['reason'] = 'force_pymupdf' return result # Step 2: 中文PDF → 直接PyMuPDF if language == 'chinese': logger.info("[Step 2] 中文PDF,使用PyMuPDF快速处理") result = extract_pdf_pymupdf(file_path) if result['success']: result['reason'] = 'chinese_pdf' result['detected_language'] = language logger.info("✅ PyMuPDF处理成功(中文PDF)") return result else: logger.error("❌ PyMuPDF处理失败") return result # Step 3: 英文PDF → 尝试Nougat logger.info("[Step 3] 英文PDF,尝试Nougat高质量解析") # 检查Nougat是否可用 if not check_nougat_available(): logger.warning("⚠️ Nougat不可用,降级到PyMuPDF") result = extract_pdf_pymupdf(file_path) if result['success']: result['reason'] = 'nougat_unavailable' result['detected_language'] = language return result # 尝试Nougat try: nougat_result = extract_pdf_nougat(file_path) if not nougat_result['success']: logger.warning("⚠️ Nougat提取失败,降级到PyMuPDF") raise Exception(nougat_result.get('error', 'Nougat failed')) # 质量检查 quality_score = nougat_result['metadata'].get('quality_score', 0) logger.info(f"Nougat质量评分: {quality_score:.2f}") # 质量阈值:0.7 if quality_score >= 0.7: logger.info("✅ Nougat处理成功(质量合格)") nougat_result['reason'] = 'english_pdf_high_quality' nougat_result['detected_language'] = language return nougat_result else: logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f},降级到PyMuPDF") raise Exception(f"Quality too low: {quality_score}") except Exception as e: logger.warning(f"Nougat处理失败: {str(e)},降级到PyMuPDF") # Step 4: 降级到PyMuPDF logger.info("[Step 4] 降级使用PyMuPDF") result = extract_pdf_pymupdf(file_path) if result['success']: result['reason'] = 'nougat_failed_or_low_quality' result['detected_language'] = language result['fallback'] = True logger.info("✅ PyMuPDF处理成功(降级方案)") else: logger.error("❌ PyMuPDF处理也失败了") return result except Exception as e: logger.error(f"PDF处理完全失败: {str(e)}") return { "success": False, "error": str(e), "method": "unknown" } def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]: """ 获取PDF处理策略(不实际提取) 用于预览将使用哪种方法 Args: file_path: PDF文件路径 Returns: { "detected_language": "chinese" | "english", "recommended_method": "nougat" | "pymupdf", "reason": "...", "nougat_available": True | False } """ try: # 检测语言 language = detect_language(file_path) # 检查Nougat可用性 nougat_available = check_nougat_available() # 决定策略 if language == 'chinese': recommended_method = 'pymupdf' reason = '中文PDF,推荐使用PyMuPDF快速处理' elif nougat_available: recommended_method = 'nougat' reason = '英文PDF,推荐使用Nougat高质量解析' else: recommended_method = 'pymupdf' reason = 'Nougat不可用,使用PyMuPDF' return { "detected_language": language, "recommended_method": recommended_method, "reason": reason, "nougat_available": nougat_available } except Exception as e: logger.error(f"获取处理策略失败: {str(e)}") return { "error": str(e) }