feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv
This commit is contained in:
160
extraction_service/services/language_detector.py
Normal file
160
extraction_service/services/language_detector.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""
|
||||
语言检测服务
|
||||
|
||||
检测PDF文档的主要语言(中文/英文/混合)
|
||||
用于决定使用哪种提取方法
|
||||
"""
|
||||
|
||||
import pdfplumber
|
||||
from typing import Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def detect_language(pdf_path: str) -> str:
    """Detect the dominant language of a PDF document.

    Strategy:
        1. Extract text from the first 3 pages (a representative sample).
        2. Compute the ratio of CJK (Chinese) characters among all
           non-whitespace characters.
        3. Classify against a 30% threshold.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' if the Chinese-character ratio exceeds 30%, otherwise
        'english'. 'english' is also the fallback when the sample text is
        too short (< 100 chars) or when extraction fails entirely.
        (Note: unlike detect_language_detailed(), this function never
        returns 'mixed'.)
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")

        with pdfplumber.open(pdf_path) as pdf:
            # Sample up to the first 3 pages (or all pages, if fewer).
            sample_pages = min(3, len(pdf.pages))
            sample_text = ""

            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                    if page_text:
                        sample_text += page_text + "\n"
                except Exception as e:
                    # A single unreadable page should not abort detection.
                    logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
                    continue

            # Too little text to classify reliably -> default to English.
            if len(sample_text.strip()) < 100:
                logger.warning("文本太少,默认使用英文处理")
                return 'english'

            # Count CJK Unified Ideographs (U+4E00..U+9FFF) versus all
            # non-whitespace characters.
            chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
            total_chars = sum(1 for c in sample_text if c.strip())

            if total_chars == 0:
                logger.warning("无有效字符,默认使用英文处理")
                return 'english'

            chinese_ratio = chinese_chars / total_chars

            logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")

            # Threshold rationale:
            #   > 30%  -> Chinese PDF (including mixed but Chinese-dominant)
            #   <= 30% -> English PDF
            if chinese_ratio > 0.3:
                language = 'chinese'
            else:
                language = 'english'

            logger.info(f"检测结果: {language}")
            return language

    except Exception as e:
        # Best-effort fallback: on any failure, treat the document as English.
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'
def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """Detect the PDF's dominant language and return detailed statistics.

    Samples up to the first 3 pages, like detect_language(), but also
    reports the raw counts and supports a 'mixed' classification.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        On success::

            {
                "language": "chinese" | "english" | "mixed",
                "chinese_ratio": 0.65,
                "chinese_chars": 3500,
                "total_chars": 5000,
                "sample_pages": 3,
                "sample_text_length": 5000
            }

        On failure: ``{"language": "english", "error": "<message>"}``.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Sample up to the first 3 pages (or all pages, if fewer).
            sample_pages = min(3, len(pdf.pages))
            sample_text = ""

            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                    if page_text:
                        sample_text += page_text + "\n"
                except Exception:
                    # Was a bare `except:`; narrowed so SystemExit and
                    # KeyboardInterrupt are no longer swallowed.
                    continue

            # Count CJK Unified Ideographs (U+4E00..U+9FFF) versus all
            # non-whitespace characters.
            chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
            total_chars = sum(1 for c in sample_text if c.strip())

            chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

            # Classification thresholds:
            #   > 30%      -> chinese
            #   10% .. 30% -> mixed
            #   <= 10%     -> english
            if chinese_ratio > 0.3:
                language = 'chinese'
            elif chinese_ratio > 0.1:
                language = 'mixed'
            else:
                language = 'english'

            return {
                "language": language,
                "chinese_ratio": round(chinese_ratio, 4),
                "chinese_chars": chinese_chars,
                "total_chars": total_chars,
                "sample_pages": sample_pages,
                "sample_text_length": len(sample_text)
            }

    except Exception as e:
        # Best-effort fallback: report English plus the error message.
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }
def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """Return True when the PDF's Chinese-character ratio exceeds *threshold*.

    Bug fix: the original ignored *threshold* entirely — it delegated to
    detect_language(), which hard-codes a 30% cutoff. The measured ratio
    is now compared against the caller-supplied threshold.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Chinese-character ratio cutoff (default 0.3, i.e. 30%).

    Returns:
        True if the Chinese-character ratio > threshold; False otherwise,
        including on detection errors (the ratio is then treated as 0).
    """
    stats = detect_language_detailed(pdf_path)
    # On failure the stats dict carries no "chinese_ratio"; defaulting to
    # 0.0 preserves the original English-on-error behavior (returns False).
    return stats.get("chinese_ratio", 0.0) > threshold
Reference in New Issue
Block a user