Files
AIclinicalresearch/extraction_service/services/language_detector.py

161 lines
4.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
语言检测服务
检测PDF文档的主要语言中文/英文/混合)
用于决定使用哪种提取方法
"""
import pdfplumber
from typing import Dict, Any
from loguru import logger
def detect_language(pdf_path: str) -> str:
    """
    Classify a PDF as predominantly Chinese or English.

    Text is sampled from at most the first three pages. The share of
    characters in the range U+4E00..U+9FFF among all non-whitespace
    characters is computed; a share strictly above 30% yields 'chinese',
    anything else yields 'english'.

    'english' is also the fallback when fewer than 100 characters of
    (stripped) text could be extracted, when the sample has no
    non-whitespace characters, or when any exception is raised.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' or 'english'. This function never returns 'mixed'.
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")

        chunks = []
        with pdfplumber.open(pdf_path) as pdf:
            # Sample the first three pages, or every page for shorter files.
            page_limit = min(3, len(pdf.pages))
            for page_no in range(1, page_limit + 1):
                try:
                    extracted = pdf.pages[page_no - 1].extract_text()
                    if extracted:
                        chunks.append(extracted + "\n")
                except Exception as e:
                    # A single unreadable page must not abort detection.
                    logger.warning(f"{page_no}页文本提取失败: {str(e)}")
                    continue

        sample_text = "".join(chunks)

        # Too little text to make a meaningful decision.
        if len(sample_text.strip()) < 100:
            logger.warning("文本太少,默认使用英文处理")
            return 'english'

        # Count Chinese characters against all non-whitespace characters.
        han_count = sum(1 for ch in sample_text if '\u4e00' <= ch <= '\u9fff')
        visible_count = sum(1 for ch in sample_text if ch.strip())

        if visible_count == 0:
            logger.warning("无有效字符,默认使用英文处理")
            return 'english'

        ratio = han_count / visible_count
        logger.info(f"中文字符比例: {ratio:.2%} ({han_count}/{visible_count})")

        # Above 30% counts as a Chinese PDF (including mixed documents that
        # are mostly Chinese); 30% or below counts as English.
        language = 'chinese' if ratio > 0.3 else 'english'

        logger.info(f"检测结果: {language}")
        return language

    except Exception as e:
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'
def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect the language of a PDF and return the underlying statistics.

    Text is sampled from at most the first three pages. The ratio of
    characters in U+4E00..U+9FFF to all non-whitespace characters decides
    the label: above 0.3 is 'chinese', above 0.1 (up to 0.3) is 'mixed',
    otherwise 'english'. Unlike ``detect_language``, no minimum amount of
    text is required; an empty sample gives a ratio of 0 and 'english'.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        On success, a dict of the form::

            {
                "language": "chinese" | "english" | "mixed",
                "chinese_ratio": 0.65,       # rounded to 4 decimals
                "chinese_chars": 3500,
                "total_chars": 5000,         # non-whitespace characters
                "sample_pages": 3,
                "sample_text_length": 5000   # raw length incl. whitespace
            }

        If opening or processing the PDF fails, only
        ``{"language": "english", "error": "<message>"}`` is returned.
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            sample_pages = min(3, len(pdf.pages))
            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                    if page_text:
                        page_texts.append(page_text + "\n")
                except Exception as e:
                    # Skip an unreadable page, but do not swallow
                    # KeyboardInterrupt/SystemExit as a bare except would,
                    # and leave a trace of the failure in the log.
                    logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
                    continue

        sample_text = "".join(page_texts)

        # Statistics
        chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
        total_chars = sum(1 for c in sample_text if c.strip())
        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

        # Classification
        if chinese_ratio > 0.3:
            language = 'chinese'
        elif chinese_ratio > 0.1:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": chinese_chars,
            "total_chars": total_chars,
            "sample_pages": sample_pages,
            "sample_text_length": len(sample_text)
        }

    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }
def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Return whether a PDF counts as Chinese for the given ratio threshold.

    The previous implementation ignored ``threshold`` and always applied a
    fixed 30% cut-off. The decision is now made from the character counts
    reported by ``detect_language_detailed`` so that the caller-supplied
    threshold is honoured.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Minimum share of Chinese characters among non-whitespace
            characters (exclusive). Defaults to 0.3 (30%).

    Returns:
        True if the Chinese character ratio is strictly greater than
        ``threshold``. False if detection failed, if the sampled text is
        shorter than 100 characters, or if it has no non-whitespace
        characters.
    """
    stats = detect_language_detailed(pdf_path)

    # On failure the stats dict carries no counts; treat it as "not Chinese".
    total_chars = stats.get("total_chars", 0)

    # Too little text is treated as English, as detect_language does.
    # NOTE(review): this uses the raw sample length, whereas detect_language
    # strips surrounding whitespace first, so results can differ for samples
    # right at the 100-character boundary.
    if stats.get("sample_text_length", 0) < 100 or total_chars == 0:
        return False

    # Use the exact counts rather than the rounded "chinese_ratio" value so
    # ratios just above the threshold are not rounded down onto it.
    return stats.get("chinese_chars", 0) / total_chars > threshold