161 lines
4.4 KiB
Python
161 lines
4.4 KiB
Python
"""
|
||
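Language detection service.

Detects the primary language of a PDF document (Chinese / English / mixed).
The result is used to decide which extraction method to apply.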
|
||
"""
|
||
|
||
import pdfplumber
|
||
from typing import Dict, Any
|
||
from loguru import logger
|
||
|
||
|
||
def detect_language(pdf_path: str) -> str:
    """
    Classify a PDF as predominantly Chinese or English.

    Approach:
        1. Extract text from the first 3 pages (or every page when the
           document has fewer).
        2. Compute the share of characters in U+4E00..U+9FFF among all
           non-whitespace characters of that sample.
        3. Report 'chinese' when the share is above 30%, otherwise 'english'.

    'english' is also the fallback when fewer than 100 characters (after
    stripping) could be extracted, when the sample holds no non-whitespace
    character, or when any exception is raised while processing the file.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' or 'english'. This function never returns 'mixed': a
        document mixing both languages is labelled by whichever side of the
        30% threshold it falls on.
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")

        with pdfplumber.open(pdf_path) as document:
            # Sample at most three pages; shorter documents use all pages.
            page_limit = min(3, len(document.pages))
            pieces = []

            for index in range(page_limit):
                try:
                    extracted = document.pages[index].extract_text()
                    if extracted:
                        pieces.append(extracted + "\n")
                except Exception as page_error:
                    # One unreadable page must not abort the whole detection.
                    logger.warning(f"第{index+1}页文本提取失败: {str(page_error)}")
                    continue

            text = "".join(pieces)

            # Too little text to judge reliably: fall back to English.
            if len(text.strip()) < 100:
                logger.warning("文本太少,默认使用英文处理")
                return 'english'

            # Count ideographs and all non-whitespace characters.
            han_count = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
            visible_count = sum(1 for ch in text if ch.strip())

            if visible_count == 0:
                logger.warning("无有效字符,默认使用英文处理")
                return 'english'

            ratio = han_count / visible_count

            logger.info(f"中文字符比例: {ratio:.2%} ({han_count}/{visible_count})")

            # Threshold:
            #   > 30%  -> Chinese PDF (includes mixed documents that are
            #             mostly Chinese)
            #   <= 30% -> English PDF
            result = 'chinese' if ratio > 0.3 else 'english'

            logger.info(f"检测结果: {result}")
            return result

    except Exception as error:
        logger.error(f"语言检测失败: {str(error)},默认使用英文处理")
        return 'english'
|
||
|
||
|
||
def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect a PDF's language and return the statistics behind the decision.

    Text is sampled from the first 3 pages (or every page when the document
    has fewer). The ratio of characters in U+4E00..U+9FFF to all
    non-whitespace characters selects the label:

        ratio > 0.3          -> 'chinese'
        0.1 < ratio <= 0.3   -> 'mixed'
        otherwise            -> 'english'

    Unlike detect_language(), this function has a 'mixed' band and applies no
    minimum-sample-size fallback; a sample with no non-whitespace characters
    simply yields a ratio of 0.0 and therefore 'english'.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A dict with the keys:
            "language": 'chinese' | 'english' | 'mixed'
            "chinese_ratio": ratio rounded to 4 decimals, e.g. 0.65
            "chinese_chars": number of characters in U+4E00..U+9FFF
            "total_chars": number of non-whitespace characters
            "sample_pages": number of pages sampled (at most 3)
            "sample_text_length": length of the sampled text, including
                whitespace and the newline appended after each page

        If the file cannot be processed, "language" is 'english', the
        numeric fields are zero, and an extra "error" key holds the
        exception message.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            sample_pages = min(3, len(pdf.pages))
            page_texts = []

            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                except Exception as e:
                    # Skip the unreadable page but leave a trace in the log,
                    # matching detect_language(). A bare ``except:`` here
                    # would also swallow KeyboardInterrupt / SystemExit.
                    logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
                    continue
                if page_text:
                    page_texts.append(page_text + "\n")

            sample_text = "".join(page_texts)

        # Statistics
        chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
        total_chars = sum(1 for c in sample_text if c.strip())

        # Guard against division by zero when no text could be extracted.
        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0.0

        # Classify
        if chinese_ratio > 0.3:
            language = 'chinese'
        elif chinese_ratio > 0.1:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": chinese_chars,
            "total_chars": total_chars,
            "sample_pages": sample_pages,
            "sample_text_length": len(sample_text),
        }

    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        # Keep the documented keys present so callers indexing the result
        # do not hit KeyError on the failure path.
        return {
            "language": "english",
            "chinese_ratio": 0.0,
            "chinese_chars": 0,
            "total_chars": 0,
            "sample_pages": 0,
            "sample_text_length": 0,
            "error": str(e),
        }
|
||
|
||
|
||
def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Report whether a PDF's share of Chinese characters exceeds ``threshold``.

    The character counts come from detect_language_detailed(), so the sample
    is the first 3 pages (or fewer) and "Chinese" means characters in
    U+4E00..U+9FFF, measured against all non-whitespace characters.

    Previously ``threshold`` was accepted but ignored: the decision was
    delegated to detect_language(), which hard-codes 30%. The parameter is
    now honoured; the default of 0.3 keeps the former cut-off.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Minimum Chinese-character ratio (exclusive), default 30%.

    Returns:
        True if the Chinese-character ratio is strictly greater than
        ``threshold``. False if the ratio is lower, if the file cannot be
        processed, or if the sample is shorter than 100 characters.
    """
    stats = detect_language_detailed(pdf_path)

    # On failure detect_language_detailed() may return a dict without the
    # count fields; .get() defaults make that case resolve to False.
    total_chars = stats.get("total_chars", 0)
    if total_chars == 0:
        return False

    # detect_language() treats samples under 100 characters as English.
    # Keep that guard so tiny samples are not classified as Chinese.
    # NOTE: "sample_text_length" includes whitespace, whereas
    # detect_language() measures the stripped text, so this is a close
    # approximation of that rule rather than an exact copy.
    if stats.get("sample_text_length", 0) < 100:
        return False

    # Use the raw counts rather than "chinese_ratio", which is rounded to
    # 4 decimals and could flip a comparison right at the threshold.
    return stats.get("chinese_chars", 0) / total_chars > threshold
|
||
|
||
|
||
|
||
|
||
|
||
|