"""
Language detection service.

Detects the dominant language of a PDF document (Chinese / English / mixed)
so that callers can decide which extraction method to use.
"""

import pdfplumber
from typing import Any, Dict, Tuple
from loguru import logger

# Number of leading pages sampled for detection.
_SAMPLE_PAGES = 3
# Minimum stripped sample length required before a ratio is trusted.
_MIN_SAMPLE_CHARS = 100
# A Chinese-character ratio strictly above this is classified as 'chinese'.
_CHINESE_THRESHOLD = 0.3
# A ratio strictly above this (and not above the Chinese threshold) is
# classified as 'mixed' by the detailed detector.
_MIXED_THRESHOLD = 0.1


def _extract_sample_text(pdf_path: str) -> Tuple[str, int]:
    """Return the text of the first pages of the PDF and the number of pages sampled.

    Pages whose text extraction raises are skipped with a warning; pages
    that yield no text contribute nothing. Errors raised while opening the
    PDF propagate to the caller.
    """
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        sample_pages = min(_SAMPLE_PAGES, len(pages))
        chunks = []
        for page_no, page in enumerate(pages[:sample_pages], start=1):
            try:
                page_text = page.extract_text()
            except Exception as e:
                logger.warning(f"第{page_no}页文本提取失败: {str(e)}")
                continue
            if page_text:
                chunks.append(page_text + "\n")
    return "".join(chunks), sample_pages


def _count_chars(text: str) -> Tuple[int, int]:
    """Return (chinese_chars, total_chars) for *text*.

    Chinese characters are those in the range U+4E00..U+9FFF; the total
    counts every non-whitespace character.
    """
    chinese_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    # c.strip() is empty exactly for whitespace characters.
    total_chars = sum(1 for c in text if c.strip())
    return chinese_chars, total_chars


def _detect_language_with_threshold(pdf_path: str, threshold: float) -> str:
    """Classify the PDF as 'chinese' or 'english' using the given ratio threshold.

    Shared implementation behind detect_language() and is_chinese_pdf().
    Never raises: any failure is logged and reported as 'english'.
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")
        sample_text, _ = _extract_sample_text(pdf_path)

        # Too little text to make a meaningful decision.
        if len(sample_text.strip()) < _MIN_SAMPLE_CHARS:
            logger.warning("文本太少,默认使用英文处理")
            return 'english'

        chinese_chars, total_chars = _count_chars(sample_text)
        if total_chars == 0:
            logger.warning("无有效字符,默认使用英文处理")
            return 'english'

        chinese_ratio = chinese_chars / total_chars
        logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")

        # Strictly above the threshold counts as Chinese (this includes
        # mixed documents that are predominantly Chinese); anything else
        # is treated as English.
        language = 'chinese' if chinese_ratio > threshold else 'english'
        logger.info(f"检测结果: {language}")
        return language

    except Exception as e:
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'


def detect_language(pdf_path: str) -> str:
    """
    Detect the dominant language of a PDF.

    Strategy:
    1. Extract the text of the first 3 pages (or all pages if fewer).
    2. Compute the share of Chinese characters among non-whitespace
       characters.
    3. Classify: a ratio above 30% is 'chinese', otherwise 'english'.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' | 'english'. Falls back to 'english' when the sample
        contains fewer than 100 characters or when detection fails for any
        reason. This function never returns 'mixed'; use
        detect_language_detailed() for that distinction.
    """
    return _detect_language_with_threshold(pdf_path, _CHINESE_THRESHOLD)


def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect the language of a PDF and return the underlying statistics.

    Unlike detect_language(), no minimum sample length is enforced, and a
    ratio above 10% (but not above 30%) is reported as 'mixed'.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        On success:
        {
            "language": "chinese" | "english" | "mixed",
            "chinese_ratio": 0.65,
            "chinese_chars": 3500,
            "total_chars": 5000,
            "sample_pages": 3,
            "sample_text_length": 5000
        }
        On failure only {"language": "english", "error": "<message>"} is
        returned.
    """
    try:
        sample_text, sample_pages = _extract_sample_text(pdf_path)

        chinese_chars, total_chars = _count_chars(sample_text)
        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

        if chinese_ratio > _CHINESE_THRESHOLD:
            language = 'chinese'
        elif chinese_ratio > _MIXED_THRESHOLD:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": chinese_chars,
            "total_chars": total_chars,
            "sample_pages": sample_pages,
            "sample_text_length": len(sample_text)
        }

    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }


def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Return whether the PDF is predominantly Chinese.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Chinese-character ratio threshold, 30% by default.

    Returns:
        True if the Chinese-character ratio of the sampled text is strictly
        greater than threshold. False when the sample contains fewer than
        100 characters or when detection fails.
    """
    # The threshold is forwarded to the detector; previously it was
    # accepted but ignored, so non-default values had no effect.
    return _detect_language_with_threshold(pdf_path, threshold) == 'chinese'