feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

This commit is contained in:
AI Clinical Dev Team
2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions

View File

@@ -0,0 +1,160 @@
"""
语言检测服务
检测PDF文档的主要语言(中文/英文/混合)
用于决定使用哪种提取方法
"""
import pdfplumber
from typing import Dict, Any
from loguru import logger
def detect_language(pdf_path: str) -> str:
    """Detect the dominant language of a PDF document.

    Strategy:
        1. Sample text from the first 3 pages (representative enough).
        2. Compute the ratio of CJK (Chinese) characters among all
           non-whitespace characters.
        3. Classify: ratio > 30% -> 'chinese', otherwise 'english'.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' or 'english'. Note: unlike detect_language_detailed(),
        this function never returns 'mixed' (the previous docstring was
        wrong about that). Falls back to 'english' when too little text
        is found or any error occurs.
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")
        with pdfplumber.open(pdf_path) as pdf:
            # Sample the first 3 pages, or all pages if the PDF is shorter.
            sample_pages = min(3, len(pdf.pages))
            parts = []
            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                    if page_text:
                        parts.append(page_text + "\n")
                except Exception as e:
                    # Best-effort sampling: a single bad page is skipped.
                    logger.warning(f"{i+1}页文本提取失败: {str(e)}")
                    continue
            sample_text = "".join(parts)
            # Too little text to classify reliably (e.g. a scanned PDF).
            if len(sample_text.strip()) < 100:
                logger.warning("文本太少,默认使用英文处理")
                return 'english'
            # Count CJK Unified Ideographs vs. all non-whitespace characters.
            chinese_chars = sum('\u4e00' <= c <= '\u9fff' for c in sample_text)
            total_chars = sum(1 for c in sample_text if c.strip())
            if total_chars == 0:
                logger.warning("无有效字符,默认使用英文处理")
                return 'english'
            chinese_ratio = chinese_chars / total_chars
            logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")
            # Threshold rationale:
            #   > 30%: treat as a Chinese PDF (incl. mixed docs that are
            #          predominantly Chinese)
            #   <= 30%: treat as an English PDF
            language = 'chinese' if chinese_ratio > 0.3 else 'english'
            logger.info(f"检测结果: {language}")
            return language
    except Exception as e:
        # Fail safe: any extraction error falls back to English processing.
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'
def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """Detect the PDF's language and return detailed statistics.

    Same sampling strategy as detect_language() (first 3 pages), but with
    a three-way classification and full counters in the result.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        On success::

            {
                "language": "chinese" | "english" | "mixed",
                "chinese_ratio": 0.65,     # rounded to 4 decimals
                "chinese_chars": 3500,
                "total_chars": 5000,
                "sample_pages": 3,
                "sample_text_length": 5000
            }

        On failure: {"language": "english", "error": "<message>"} —
        note the statistics keys are absent in that case.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            sample_pages = min(3, len(pdf.pages))
            parts = []
            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                    if page_text:
                        parts.append(page_text + "\n")
                # BUG FIX: was a bare `except:` with a silent `continue`,
                # which also swallows KeyboardInterrupt/SystemExit and
                # hides per-page failures. Narrowed and logged, matching
                # detect_language()'s handling.
                except Exception as e:
                    logger.warning(f"{i+1}页文本提取失败: {str(e)}")
                    continue
            sample_text = "".join(parts)
            # Ratio of CJK Unified Ideographs over non-whitespace chars.
            chinese_chars = sum('\u4e00' <= c <= '\u9fff' for c in sample_text)
            total_chars = sum(1 for c in sample_text if c.strip())
            chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0
            # Three-way classification (detect_language() folds the
            # 'mixed' band into 'english').
            if chinese_ratio > 0.3:
                language = 'chinese'
            elif chinese_ratio > 0.1:
                language = 'mixed'
            else:
                language = 'english'
            return {
                "language": language,
                "chinese_ratio": round(chinese_ratio, 4),
                "chinese_chars": chinese_chars,
                "total_chars": total_chars,
                "sample_pages": sample_pages,
                "sample_text_length": len(sample_text),
            }
    except Exception as e:
        # Fail safe: report the error but default to English processing.
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e),
        }
def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """Return True when the PDF's Chinese-character ratio exceeds *threshold*.

    BUG FIX: the original implementation ignored *threshold* entirely and
    delegated to detect_language(), which hard-codes a 0.3 cutoff — so
    passing a custom threshold had no effect. The ratio from
    detect_language_detailed() is now compared against the caller's
    threshold. With the default threshold=0.3 the result is unchanged.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Chinese-character ratio cutoff (default 30%).

    Returns:
        True if the Chinese-character ratio is strictly greater than
        *threshold*; False otherwise, including when detection fails.
    """
    stats = detect_language_detailed(pdf_path)
    # On failure the stats dict carries no "chinese_ratio" key, so this
    # yields False — same outcome as the old 'english' fallback.
    return stats.get("chinese_ratio", 0.0) > threshold