feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

This commit is contained in:
AI Clinical Dev Team
2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions

72
.gitignore vendored
View File

@@ -48,6 +48,78 @@ tmp/
temp/
*.tmp
# ==================== Python ====================
# Virtual environments (重要!避免提交 2+ GB 的依赖)
venv/
env/
.venv/
ENV/
env.bak/
venv.bak/
# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
# PyInstaller
*.manifest
*.spec
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# Celery
celerybeat-schedule
celerybeat.pid
# Environments
.env
.venv
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Unit test / coverage
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre
.pyre/
# pytype
.pytype/
# Cython
cython_debug/

40
extraction_service/.gitignore vendored Normal file
View File

@@ -0,0 +1,40 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
build/
dist/
*.egg-info/
# 环境变量
.env
# IDE
.vscode/
.idea/
*.swp
*.swo
# 临时文件
/tmp/
*.log
# 测试
.pytest_cache/
.coverage
htmlcov/
# OS
.DS_Store
Thumbs.db

View File

@@ -0,0 +1,181 @@
# 文档提取微服务
基于FastAPI的文档文本提取服务，支持PDF、Docx、Txt格式。
## 功能特性
- **PDF提取**：使用PyMuPDF快速提取PDF文本
- **Docx提取**：使用Mammoth提取Word文档（Day 3）
- **Txt提取**：支持多种编码（Day 3）
- **语言检测**：自动检测PDF语言（Day 2）
- **Nougat集成**：高质量学术PDF解析（Day 2）
## 快速开始
### 1. 安装依赖
```bash
cd extraction_service
# 创建虚拟环境(推荐)
python -m venv venv
source venv/bin/activate # Windows: venv\Scripts\activate
# 安装依赖
pip install -r requirements.txt
```
### 2. 配置环境变量
```bash
# 复制示例配置
cp .env.example .env
# 编辑配置(可选)
# SERVICE_PORT=8000
# DEBUG=True
```
### 3. 启动服务
```bash
# 开发模式(自动重载)
python main.py
# 或使用uvicorn
uvicorn main:app --reload --port 8000
```
服务将在 http://localhost:8000 启动
### 4. 测试服务
#### 健康检查
```bash
curl http://localhost:8000/api/health
```
返回:
```json
{
"status": "healthy",
"checks": {
"pymupdf": {
"available": true,
"version": "1.23.8"
},
"temp_dir": {
"path": "/tmp/extraction_service",
"writable": true
}
}
}
```
#### PDF文本提取
```bash
curl -X POST http://localhost:8000/api/extract/pdf \
-F "file=@test.pdf"
```
返回:
```json
{
"success": true,
"method": "pymupdf",
"text": "提取的文本内容...",
"metadata": {
"page_count": 20,
"char_count": 50000,
"file_size": 1024000,
"filename": "test.pdf"
}
}
```
## API文档
启动服务后访问:
- Swagger UI: http://localhost:8000/docs
- ReDoc: http://localhost:8000/redoc
## 项目结构
```
extraction_service/
├── main.py # 主应用入口
├── requirements.txt # Python依赖
├── .env.example # 环境变量示例
├── README.md # 本文件
├── services/ # 服务模块
│ ├── __init__.py
│ ├── pdf_extractor.py # PDF提取PyMuPDF
│ ├── nougat_extractor.py # Nougat提取Day 2
│ ├── docx_extractor.py # Docx提取Day 3
│ ├── txt_extractor.py # Txt提取Day 3
│ ├── language_detector.py # 语言检测Day 2
│ └── file_utils.py # 文件工具
└── tests/ # 测试文件(待添加)
```
## 开发计划
### ✅ Day 1已完成
- [x] FastAPI项目搭建
- [x] PyMuPDF集成
- [x] PDF文本提取功能
- [x] 健康检查API
### ⏳ Day 2进行中
- [ ] 安装Nougat
- [ ] 语言检测功能
- [ ] Nougat提取逻辑
- [ ] 顺序降级机制
### ⏳ Day 3
- [ ] Docx提取Mammoth
- [ ] Txt提取多编码
- [ ] 文件格式验证
## 依赖说明
| 库 | 版本 | 用途 |
|---|---|---|
| fastapi | 0.104.1 | Web框架 |
| uvicorn | 0.24.0 | ASGI服务器 |
| PyMuPDF | >=1.24.0 | PDF文本提取 |
| pdfplumber | 0.10.3 | PDF语言检测 |
| mammoth | 1.6.0 | Docx提取 |
| langdetect | 1.0.9 | 语言检测 |
| loguru | 0.7.2 | 日志管理 |
## 性能指标
| 操作 | 目标时间 |
|---|---|
| 20页PDF（PyMuPDF） | <30秒 |
| 10页Docx | <10秒 |
| 1MB Txt | <5秒 |
## 常见问题
### Q: PyMuPDF安装失败？
A: 确保Python版本>=3.8，并使用pip安装：`pip install PyMuPDF`
### Q: 服务无法启动？
A: 检查端口8000是否被占用；可修改.env中的SERVICE_PORT
### Q: 临时文件在哪里？
A: 默认在/tmp/extraction_service目录，可通过TEMP_DIR环境变量配置
## License
MIT

View File

@@ -0,0 +1,89 @@
@echo off
REM Installer for the document extraction service: checks for Python, creates
REM and activates a local "venv", upgrades pip, installs requirements.txt and
REM finally prints the versions of the key packages as a smoke test.
chcp 65001 >nul
echo ================================
echo 安装文档提取微服务依赖
echo ================================
echo.
REM Step 1: make sure a Python interpreter is reachable on PATH
echo [1/5] 检查Python环境...
python --version >nul 2>&1
if errorlevel 1 (
echo ❌ 错误: 未找到Python
echo 请先安装Python 3.8或更高版本
echo 下载地址: https://www.python.org/downloads/
pause
exit /b 1
)
python --version
echo ✅ Python已安装
echo.
REM Step 2: create the virtual environment unless a "venv" entry already exists
echo [2/5] 创建虚拟环境...
if exist venv (
echo 虚拟环境已存在,跳过创建
) else (
python -m venv venv
if errorlevel 1 (
echo ❌ 创建虚拟环境失败
pause
exit /b 1
)
echo ✅ 虚拟环境创建成功
)
echo.
REM Step 3: activate the virtual environment for the rest of this script
echo [3/5] 激活虚拟环境...
call venv\Scripts\activate
if errorlevel 1 (
echo ❌ 激活虚拟环境失败
pause
exit /b 1
)
echo ✅ 虚拟环境已激活
echo.
REM Step 4: upgrade pip (the result of this step is not checked)
echo [4/5] 升级pip...
python -m pip install --upgrade pip
echo.
REM Step 5: install the pinned dependencies; abort if pip reports an error
echo [5/5] 安装依赖包...
echo 这可能需要几分钟时间...
pip install -r requirements.txt
if errorlevel 1 (
echo ❌ 依赖安装失败
pause
exit /b 1
)
echo.
REM Verify the installation by importing the key packages
echo ================================
echo 验证安装
echo ================================
python -c "import fastapi; print('✅ FastAPI:', fastapi.__version__)"
python -c "import fitz; print('✅ PyMuPDF:', fitz.__version__)"
python -c "import uvicorn; print('✅ Uvicorn: OK')"
echo.
echo ================================
echo 🎉 安装完成!
echo ================================
echo.
echo 下一步:
echo 1. 启动服务: start.bat
echo 2. 测试服务: python test_service.py
echo.
pause

View File

@@ -0,0 +1,88 @@
@echo off
REM Optional installer for Nougat OCR: requires the "venv" created by
REM install.bat, installs the pinned nougat-ocr package into it, then checks
REM both the Python module import and the "nougat" command-line tool.
chcp 65001 >nul
echo ================================
echo 安装Nougat OCR
echo ================================
echo.
echo ⚠️ 注意事项:
echo 1. Nougat需要Python 3.8+
echo 2. 首次运行会下载模型文件约350MB
echo 3. 建议使用GPU加速需CUDA
echo 4. 安装可能需要5-10分钟
echo.
pause
REM Activate the virtual environment; abort if install.bat has not created it yet
if exist venv\Scripts\activate.bat (
echo [1/4] 激活虚拟环境...
call venv\Scripts\activate
) else (
echo 错误: 请先运行 install.bat 创建虚拟环境
pause
exit /b 1
)
REM Install Nougat (pinned version) and print troubleshooting hints on failure
echo.
echo [2/4] 安装Nougat OCR...
echo 这可能需要几分钟时间...
echo.
pip install nougat-ocr==0.1.17
if errorlevel 1 (
echo.
echo ❌ Nougat安装失败
echo.
echo 可能的原因:
echo 1. 网络问题:请使用国内镜像源
echo 2. Python版本需要Python 3.8+
echo 3. 依赖冲突:可能需要新的虚拟环境
echo.
echo 替代方案:
echo - 如果只使用中文PDF可以不安装Nougat
echo - 系统会自动降级使用PyMuPDF
echo.
pause
exit /b 1
)
echo.
REM Verify that the Python module imports (the result of this step is not checked)
echo [3/4] 验证安装...
python -c "import nougat; print('✅ Nougat导入成功')"
echo.
REM Verify the command-line entry point; a missing CLI is only reported as a warning
echo [4/4] 测试Nougat命令...
nougat --version
if errorlevel 1 (
echo ⚠️ 命令行工具未找到但Python模块已安装
echo 这可能不影响使用系统会尝试直接调用Python模块
) else (
echo ✅ Nougat命令行工具正常
)
echo.
echo ================================
echo 🎉 Nougat安装完成
echo ================================
echo.
echo 说明:
echo - Nougat擅长处理英文学术PDF
echo - 能保留表格、公式等结构
echo - 中文PDF会自动使用PyMuPDF
echo - 首次使用会下载模型约350MB
echo.
echo 下一步:
echo - 启动服务: start.bat
echo - 健康检查: curl http://localhost:8000/api/health
echo.
pause

508
extraction_service/main.py Normal file
View File

@@ -0,0 +1,508 @@
"""
文档提取微服务 - 主入口
功能:
- PDF文本提取PyMuPDF
- Docx文本提取Mammoth
- Txt文本提取直接读取
- 语言检测
- 健康检查
"""
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger
from pathlib import Path
import os
import sys
from datetime import datetime
from dotenv import load_dotenv
# Load environment variables (python-dotenv) before any os.getenv() call below
load_dotenv()
# Configure logging: remove the handlers registered so far, then log to stdout
# at the level named by LOG_LEVEL (default "INFO")
logger.remove()
logger.add(
    sys.stdout,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
    level=os.getenv("LOG_LEVEL", "INFO")
)
# Create the FastAPI application
app = FastAPI(
    title="文档提取微服务",
    description="提供PDF、Docx、Txt文档的文本提取服务",
    version="1.0.0",
)
# CORS configuration
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm the intended origins before deploying to production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # production should restrict this to specific domains
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Directory for temporary upload files; created at import time if missing
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
# 导入服务模块
from services.pdf_extractor import extract_pdf_pymupdf
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
from services.language_detector import detect_language, detect_language_detailed
from services.nougat_extractor import check_nougat_available, get_nougat_info
from services.file_utils import detect_file_type, cleanup_temp_file
from services.docx_extractor import extract_docx_mammoth, validate_docx_file
from services.txt_extractor import extract_txt, validate_txt_file
# ==================== API路由 ====================
@app.get("/")
async def root():
    """Return the service name, version and running status for the root path."""
    service_info = dict(
        service="文档提取微服务",
        version="1.0.0",
        status="running",
    )
    return service_info
@app.get("/api/health")
async def health_check():
    """
    Health-check endpoint.

    Reports three checks:
    - whether PyMuPDF (``fitz``) can be imported, and its version
    - the Nougat status as returned by ``get_nougat_info()``
    - whether the temp directory exists and is writable

    Returns:
        Dict with ``status`` ("healthy" only when PyMuPDF is importable and
        the temp directory is writable, otherwise "degraded"), the
        individual ``checks`` and an ISO-format ``timestamp``.
    """
    # Start from the pessimistic answer; only a clean import upgrades it
    pymupdf_check = {"available": False, "version": "unknown"}
    try:
        import fitz  # PyMuPDF
        pymupdf_check["version"] = fitz.__version__
        pymupdf_check["available"] = True
    except Exception as exc:
        pymupdf_check = {"available": False, "version": "unknown"}
        logger.warning(f"PyMuPDF不可用: {str(exc)}")

    nougat_check = get_nougat_info()

    can_write_temp = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)
    temp_dir_check = {"path": str(TEMP_DIR), "writable": can_write_temp}

    if pymupdf_check["available"] and can_write_temp:
        overall = "healthy"
    else:
        overall = "degraded"

    return {
        "status": overall,
        "checks": {
            "pymupdf": pymupdf_check,
            "nougat": nougat_check,
            "temp_dir": temp_dir_check,
        },
        "timestamp": datetime.now().isoformat(),
    }
@app.post("/api/extract/pdf")
async def extract_pdf_endpoint(
    file: UploadFile = File(...),
    method: str = "auto"
):
    """
    Extract text from an uploaded PDF.

    Args:
        file: Uploaded PDF file.
        method: Extraction method ('auto' | 'nougat' | 'pymupdf'). 'auto'
            passes ``force_method=None`` to ``extract_pdf``; any other value
            is forwarded unchanged as ``force_method``.

    Returns:
        JSONResponse wrapping the ``extract_pdf`` result dict, with
        ``metadata.file_size`` and ``metadata.filename`` added.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.pdf``; 500 when extraction reports failure or any other
            error occurs.
    """
    import uuid  # local import: only used to build a unique temp-file name

    temp_path = None
    try:
        # The client may omit the filename; treat that as a bad request
        # instead of crashing on ``None.lower()``
        upload_name = file.filename or ""
        if not upload_name.lower().endswith('.pdf'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持PDF文件"
            )

        # The filename is client-controlled: keep only its last path component
        # so "../" or "..\\" segments cannot escape TEMP_DIR, and add a random
        # token so concurrent uploads with the same name cannot overwrite or
        # delete each other's temp file.
        safe_name = Path(upload_name.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理PDF文件: {file.filename}, 方法={method}")

        content = await file.read()
        with open(temp_path, "wb") as f:
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Run the extraction pipeline
        force_method = None if method == "auto" else method
        result = extract_pdf(str(temp_path), force_method=force_method)
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
            )

        # Attach upload metadata (the name reported is the client's original one)
        result["metadata"]["file_size"] = file_size
        result["metadata"]["filename"] = file.filename
        logger.info(f"PDF提取成功: {file.filename}, "
                    f"方法={result['method']}, "
                    f"原因={result.get('reason', 'N/A')}")
        return JSONResponse(content=result)
    except HTTPException:
        # Already carries the intended status code; pass through untouched
        raise
    except Exception as e:
        logger.error(f"PDF提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        # Always remove the temp file, whether extraction succeeded or not
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/detect-language")
async def detect_language_endpoint(file: UploadFile = File(...)):
    """
    Detect the language of an uploaded PDF.

    Args:
        file: Uploaded PDF file.

    Returns:
        JSONResponse wrapping the ``detect_language_detailed`` result dict
        with the uploaded ``filename`` added.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.pdf``; 500 on any other error.
    """
    import uuid  # local import: only used to build a unique temp-file name

    temp_path = None
    try:
        # A missing filename is a client error, not a server crash
        upload_name = file.filename or ""
        if not upload_name.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")

        # Keep only the last path component of the client-supplied name (no
        # directory traversal) and add a random token so same-named concurrent
        # uploads do not share a temp file.
        safe_name = Path(upload_name.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"

        content = await file.read()
        with open(temp_path, "wb") as f:
            f.write(content)

        # Run the detection and tag the result with the original name
        result = detect_language_detailed(str(temp_path))
        result["filename"] = file.filename
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/pdf-strategy")
async def get_strategy_endpoint(file: UploadFile = File(...)):
    """
    Report the processing strategy for an uploaded PDF.

    Args:
        file: Uploaded PDF file.

    Returns:
        JSONResponse wrapping the ``get_pdf_processing_strategy`` result dict
        with the uploaded ``filename`` added.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.pdf``; 500 on any other error.
    """
    import uuid  # local import: only used to build a unique temp-file name

    temp_path = None
    try:
        # A missing filename is a client error, not a server crash
        upload_name = file.filename or ""
        if not upload_name.lower().endswith('.pdf'):
            raise HTTPException(status_code=400, detail="只支持PDF文件")

        # Keep only the last path component of the client-supplied name (no
        # directory traversal) and add a random token so same-named concurrent
        # uploads do not share a temp file.
        safe_name = Path(upload_name.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"

        content = await file.read()
        with open(temp_path, "wb") as f:
            f.write(content)

        # Ask for the strategy and tag the result with the original name
        result = get_pdf_processing_strategy(str(temp_path))
        result["filename"] = file.filename
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"获取策略失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract/docx")
async def extract_docx_endpoint(file: UploadFile = File(...)):
    """
    Extract text from an uploaded Docx document.

    Args:
        file: Uploaded .docx file.

    Returns:
        JSONResponse wrapping the ``extract_docx_mammoth`` result dict, with
        ``method`` set to "mammoth" and ``metadata.filename`` added.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.docx``; 500 when extraction reports failure or any other
            error occurs.
    """
    import uuid  # local import: only used to build a unique temp-file name

    temp_path = None
    try:
        # A missing filename is a client error, not a server crash
        upload_name = file.filename or ""
        if not upload_name.lower().endswith('.docx'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持Docx文件"
            )

        # Keep only the last path component of the client-supplied name (no
        # directory traversal) and add a random token so same-named concurrent
        # uploads do not share a temp file. The ".docx" suffix is preserved.
        safe_name = Path(upload_name.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理Docx文件: {file.filename}")

        content = await file.read()
        with open(temp_path, "wb") as f:
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Run the extraction
        result = extract_docx_mammoth(str(temp_path))
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
            )

        # Tag the result with the method and the client's original file name
        result["method"] = "mammoth"
        result["metadata"]["filename"] = file.filename
        logger.info(f"Docx提取成功: {file.filename}, "
                    f"字符数={result['metadata']['char_count']}")
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract/txt")
async def extract_txt_endpoint(file: UploadFile = File(...)):
    """
    Extract the content of an uploaded plain-text file.

    Args:
        file: Uploaded .txt file.

    Returns:
        JSONResponse wrapping the ``extract_txt`` result dict, with
        ``method`` set to "direct" and ``metadata.filename`` added.

    Raises:
        HTTPException: 400 when the filename is missing or does not end in
            ``.txt``; 500 when extraction reports failure or any other
            error occurs.
    """
    import uuid  # local import: only used to build a unique temp-file name

    temp_path = None
    try:
        # A missing filename is a client error, not a server crash
        upload_name = file.filename or ""
        if not upload_name.lower().endswith('.txt'):
            raise HTTPException(
                status_code=400,
                detail="文件格式错误只支持Txt文件"
            )

        # Keep only the last path component of the client-supplied name (no
        # directory traversal) and add a random token so same-named concurrent
        # uploads do not share a temp file.
        safe_name = Path(upload_name.replace("\\", "/")).name
        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
        logger.info(f"开始处理Txt文件: {file.filename}")

        content = await file.read()
        with open(temp_path, "wb") as f:
            f.write(content)
        file_size = len(content)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # Run the extraction
        result = extract_txt(str(temp_path))
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
            )

        # Tag the result with the method and the client's original file name
        result["method"] = "direct"
        result["metadata"]["filename"] = file.filename
        logger.info(f"Txt提取成功: {file.filename}, "
                    f"编码={result['encoding']}, "
                    f"字符数={result['metadata']['char_count']}")
        return JSONResponse(content=result)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Txt提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    file_type: str = None
):
    """
    Generic extraction endpoint: dispatch to the PDF, Docx or Txt handler.

    Args:
        file: Uploaded file.
        file_type: Optional explicit type ('pdf' | 'docx' | 'txt'), matched
            case-insensitively. When omitted, the type is derived from the
            file name via ``detect_file_type``.

    Returns:
        Whatever the selected type-specific endpoint returns.

    Raises:
        HTTPException: 400 when the type is unsupported or cannot be derived
            from the file name; 500 on unexpected errors.
    """
    try:
        if file_type:
            # Accept "PDF", " docx " and similar for an explicitly given type
            file_type = file_type.strip().lower()
        else:
            try:
                file_type = detect_file_type(file.filename or "")
            except ValueError as e:
                # An unsupported or missing extension is the client's mistake,
                # so answer 400 instead of letting it surface as a 500
                raise HTTPException(status_code=400, detail=str(e))
        logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")

        # Dispatch to the matching type-specific handler
        if file_type == 'pdf':
            return await extract_pdf_endpoint(file)
        elif file_type == 'docx':
            return await extract_docx_endpoint(file)
        elif file_type == 'txt':
            return await extract_txt_endpoint(file)
        else:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件格式: {file_type}仅支持PDF、Docx、Txt"
            )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
# ==================== 启动配置 ====================
if __name__ == "__main__":
    import uvicorn

    # Runtime settings come from the environment, with development defaults
    port = int(os.getenv("SERVICE_PORT", 8000))
    host = os.getenv("SERVICE_HOST", "0.0.0.0")
    debug = os.getenv("DEBUG", "True").lower() == "true"
    base_url = f"http://{host}:{port}"

    for startup_line in (
        "启动文档提取微服务...",
        f"地址: {base_url}",
        f"健康检查: {base_url}/api/health",
        f"调试模式: {debug}",
    ):
        logger.info(startup_line)

    # Auto-reload is switched on only in debug mode
    server_options = {
        "host": host,
        "port": port,
        "reload": debug,
        "log_level": "info",
    }
    uvicorn.run("main:app", **server_options)

View File

@@ -0,0 +1,31 @@
# FastAPI核心依赖
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6
# PDF处理
PyMuPDF>=1.24.0 # 使用更新版本有预编译wheel
pdfplumber==0.10.3
nougat-ocr==0.1.17 # 学术PDF高质量提取英文
albumentations==1.3.1 # Nougat兼容版本不要升级到2.x
# Docx处理Day 3需要
mammoth==1.6.0
python-docx==1.1.0
# 语言检测Day 2需要
langdetect==1.0.9
# 编码检测Day 3需要
chardet==5.2.0
# 工具
python-dotenv==1.0.0
pydantic>=2.10.0 # 使用更新版本有预编译wheel
# 日志
loguru==0.7.2
# 测试工具
requests==2.31.0

View File

@@ -0,0 +1,11 @@
"""
服务模块
包含各种文档提取和处理服务
"""

View File

@@ -0,0 +1,257 @@
"""
Docx文档提取服务
使用Mammoth库提取Word文档文本
支持.docx格式不支持老版.doc
"""
import mammoth
from pathlib import Path
from typing import Dict, Any
from loguru import logger
def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
    """
    Extract the raw text of a .docx file with Mammoth.

    Args:
        file_path: Path of the Docx file.

    Returns:
        On success: ``{"success": True, "text": ..., "format": "plain_text",
        "metadata": {"char_count", "has_tables", "file_size", "warnings"}}``.
        On failure: ``{"success": False, "error": ..., "text": "",
        "metadata": {...}}``. Failure covers a missing file, a suffix other
        than ``.docx``, an empty extraction result, or any raised exception.
    """
    def _failure(reason: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        # Shared shape of every unsuccessful result
        return {
            "success": False,
            "error": reason,
            "text": "",
            "metadata": metadata,
        }

    try:
        docx_path = Path(file_path)

        # Guard clauses: the file must exist and carry the .docx suffix
        if not docx_path.exists():
            return _failure(f"文件不存在: {file_path}", {})
        if docx_path.suffix.lower() != '.docx':
            return _failure(
                f"不支持的文件格式: {docx_path.suffix},仅支持.docx", {}
            )

        logger.info(f"开始提取Docx文件: {docx_path.name}")

        with open(file_path, "rb") as stream:
            extraction = mammoth.extract_raw_text(stream)
        text = extraction.value
        messages = extraction.messages

        # Mammoth reports conversion problems as messages rather than raising
        if messages:
            logger.warning(f"Mammoth提取警告: {len(messages)}")
            for msg in messages:
                logger.debug(f" - {msg.type}: {msg.message}")

        char_count = len(text)
        if not char_count:
            logger.warning("提取的文本为空")
            return _failure(
                "文档内容为空或无法提取",
                {"char_count": 0, "file_size": docx_path.stat().st_size},
            )

        # Heuristic only: a tab or pipe character is taken as a sign of a table
        has_tables = any(marker in text for marker in ('\t', '|'))
        logger.info(f"Docx提取成功: {char_count}个字符")

        metadata = {
            "char_count": char_count,
            "has_tables": has_tables,
            "file_size": docx_path.stat().st_size,
            "warnings": len(messages),
        }
        return {
            "success": True,
            "text": text,
            "format": "plain_text",
            "metadata": metadata,
        }
    except Exception as exc:
        logger.error(f"Docx提取失败: {str(exc)}")
        return _failure(str(exc), {})
def extract_docx_html(file_path: str) -> Dict[str, Any]:
    """
    Convert a .docx file to HTML with Mammoth.

    Args:
        file_path: Path of the Docx file.

    Returns:
        On success: ``{"success": True, "html": ..., "format": "html",
        "metadata": {"html_length", "file_size", "warnings"}}``.
        On failure (missing file or any raised exception):
        ``{"success": False, "error": ..., "html": "", "metadata": {}}``.
    """
    def _failure(reason: str) -> Dict[str, Any]:
        # Shared shape of every unsuccessful result
        return {"success": False, "error": reason, "html": "", "metadata": {}}

    try:
        docx_path = Path(file_path)
        if not docx_path.exists():
            return _failure(f"文件不存在: {file_path}")

        logger.info(f"开始提取Docx为HTML: {docx_path.name}")

        with open(file_path, "rb") as stream:
            conversion = mammoth.convert_to_html(stream)
        html, messages = conversion.value, conversion.messages

        if messages:
            logger.warning(f"HTML转换警告: {len(messages)}")
        html_length = len(html)
        logger.info(f"HTML提取成功: {html_length}个字符")

        metadata = {
            "html_length": html_length,
            "file_size": docx_path.stat().st_size,
            "warnings": len(messages),
        }
        return {
            "success": True,
            "html": html,
            "format": "html",
            "metadata": metadata,
        }
    except Exception as exc:
        logger.error(f"HTML提取失败: {str(exc)}")
        return _failure(str(exc))
def validate_docx_file(file_path: str) -> Dict[str, Any]:
    """
    Check whether a path looks like a usable .docx file.

    The checks run in this order: the file exists, the suffix is ``.docx``,
    the size is at most 50MB, the file is not empty, and the first four
    bytes are the ZIP local-file signature (a docx is a ZIP container).

    Args:
        file_path: Path of the file to validate.

    Returns:
        ``{"valid": True, "reason": ..., "file_info": {"filename", "size",
        "size_mb"}}`` when every check passes, otherwise
        ``{"valid": False, "reason": ...}``.
    """
    size_limit = 50 * 1024 * 1024  # 50MB
    zip_signature = b'PK\x03\x04'

    def _reject(reason: str) -> Dict[str, Any]:
        return {"valid": False, "reason": reason}

    try:
        candidate = Path(file_path)

        if not candidate.exists():
            return _reject("文件不存在")

        if candidate.suffix.lower() != '.docx':
            return _reject(f"不支持的格式: {candidate.suffix}(仅支持.docx")

        byte_size = candidate.stat().st_size
        if byte_size > size_limit:
            return _reject(f"文件过大: {byte_size / 1024 / 1024:.2f}MB限制50MB")
        if byte_size == 0:
            return _reject("文件为空")

        # Basic sanity check: read the header and compare it to the ZIP magic
        try:
            with open(file_path, "rb") as stream:
                header = stream.read(4)
        except Exception as read_error:
            return _reject(f"无法读取文件: {str(read_error)}")
        if header != zip_signature:
            return _reject("不是有效的Docx文件ZIP签名错误")

        file_info = {
            "filename": candidate.name,
            "size": byte_size,
            "size_mb": round(byte_size / 1024 / 1024, 2),
        }
        return {"valid": True, "reason": "文件有效", "file_info": file_info}
    except Exception as exc:
        return _reject(f"验证失败: {str(exc)}")

View File

@@ -0,0 +1,88 @@
"""
文件工具函数
"""
import os
from pathlib import Path
from loguru import logger
def detect_file_type(filename: str) -> str:
    """
    Derive the document type from a file name's extension.

    Args:
        filename: File name; the extension is matched case-insensitively.

    Returns:
        'pdf' | 'docx' | 'txt'

    Raises:
        ValueError: When the name is empty, has no extension, or the
            extension is not one of the supported types.
    """
    if not filename:
        raise ValueError("不支持的文件格式: 文件名为空")

    # rpartition tells "no dot at all" apart from a real extension, so a bare
    # name such as "pdf" is no longer mistaken for a PDF file
    _, dot, ext = filename.lower().rpartition('.')
    if not dot:
        raise ValueError(f"不支持的文件格式: {filename}(缺少扩展名)")

    if ext in ('pdf', 'docx', 'txt'):
        return ext
    raise ValueError(f"不支持的文件格式: .{ext}")
def cleanup_temp_file(file_path: Path | str) -> None:
"""
清理临时文件
Args:
file_path: 文件路径
"""
try:
if isinstance(file_path, str):
file_path = Path(file_path)
if file_path.exists():
file_path.unlink()
logger.debug(f"清理临时文件: {file_path}")
except Exception as e:
logger.warning(f"清理临时文件失败: {str(e)}")
def get_file_size_mb(file_path: Path | str) -> float:
"""
获取文件大小MB
Args:
file_path: 文件路径
Returns:
文件大小MB
"""
if isinstance(file_path, str):
file_path = Path(file_path)
if file_path.exists():
return file_path.stat().st_size / (1024 * 1024)
return 0.0
def validate_file_size(file_size: int, max_size: int = 52428800) -> bool:
    """
    Check a file size against an upper limit.

    Args:
        file_size: File size in bytes.
        max_size: Largest allowed size in bytes; the default is 50MB.

    Returns:
        True when ``file_size`` does not exceed ``max_size``.
    """
    within_limit = max_size >= file_size
    return within_limit

View File

@@ -0,0 +1,160 @@
"""
语言检测服务
检测PDF文档的主要语言中文/英文/混合)
用于决定使用哪种提取方法
"""
import pdfplumber
from typing import Dict, Any
from loguru import logger
def detect_language(pdf_path: str) -> str:
    """
    Classify a PDF as Chinese or English from a sample of its text.

    Strategy:
    1. Extract the text of the first three pages (or all pages if fewer).
    2. Compute the share of CJK characters (U+4E00..U+9FFF) among the
       non-whitespace characters.
    3. A share above 30% is 'chinese'; anything else is 'english'.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        'chinese' or 'english'. 'english' is also the fallback when the
        sample holds fewer than 100 characters or when detection fails.
    """
    try:
        logger.info(f"开始语言检测: {pdf_path}")
        with pdfplumber.open(pdf_path) as pdf:
            page_limit = min(3, len(pdf.pages))
            collected = []
            for page_index in range(page_limit):
                try:
                    extracted = pdf.pages[page_index].extract_text()
                except Exception as e:
                    # One unreadable page should not abort the whole sample
                    logger.warning(f"{page_index+1}页文本提取失败: {str(e)}")
                    continue
                if extracted:
                    collected.append(extracted + "\n")
            sample_text = "".join(collected)

            # Too little text to judge reliably
            if len(sample_text.strip()) < 100:
                logger.warning("文本太少,默认使用英文处理")
                return 'english'

            chinese_chars = sum(1 for ch in sample_text if '\u4e00' <= ch <= '\u9fff')
            total_chars = sum(1 for ch in sample_text if ch.strip())
            if total_chars == 0:
                logger.warning("无有效字符,默认使用英文处理")
                return 'english'

            chinese_ratio = chinese_chars / total_chars
            logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")

            # Threshold: > 30% counts as Chinese (including mixed documents
            # dominated by Chinese); <= 30% counts as English
            language = 'chinese' if chinese_ratio > 0.3 else 'english'
            logger.info(f"检测结果: {language}")
            return language
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'
def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect a PDF's language and return the statistics behind the decision.

    The sample is the text of the first three pages (or all pages if fewer).
    The ratio is CJK characters (U+4E00..U+9FFF) over non-whitespace
    characters: > 0.3 is 'chinese', > 0.1 is 'mixed', otherwise 'english'.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        On success::

            {
                "language": "chinese" | "english" | "mixed",
                "chinese_ratio": 0.65,
                "chinese_chars": 3500,
                "total_chars": 5000,
                "sample_pages": 3,
                "sample_text_length": 5000
            }

        On failure: ``{"language": "english", "error": "<message>"}``.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            sample_pages = min(3, len(pdf.pages))
            page_texts = []
            for i in range(sample_pages):
                try:
                    page_text = pdf.pages[i].extract_text()
                except Exception as e:
                    # Skip the unreadable page but leave a trace; the previous
                    # bare ``except:`` hid the failure and also swallowed
                    # KeyboardInterrupt/SystemExit
                    logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
                    continue
                if page_text:
                    page_texts.append(page_text + "\n")
            sample_text = "".join(page_texts)

        # Statistics over the sampled text
        chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
        total_chars = sum(1 for c in sample_text if c.strip())
        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

        # Classification
        if chinese_ratio > 0.3:
            language = 'chinese'
        elif chinese_ratio > 0.1:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": chinese_chars,
            "total_chars": total_chars,
            "sample_pages": sample_pages,
            "sample_text_length": len(sample_text)
        }
    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }
def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Report whether a PDF's sampled Chinese-character ratio exceeds ``threshold``.

    The ratio comes from ``detect_language_detailed``. Before this change the
    ``threshold`` argument was accepted but ignored, because the answer was
    taken from ``detect_language`` and its fixed 30% cut-off.

    Args:
        pdf_path: Path of the PDF file.
        threshold: Minimum ratio (exclusive) of Chinese characters; default 30%.

    Returns:
        True if the Chinese-character ratio is greater than ``threshold``.
        False when detection fails or the sample is too short to judge.
    """
    stats = detect_language_detailed(pdf_path)
    # detect_language treats samples shorter than 100 characters as English;
    # apply the comparable guard here so very short documents stay non-Chinese.
    # NOTE(review): the guard uses the unstripped sample length and the ratio
    # is rounded to 4 decimals, so results exactly at the boundary can differ
    # marginally from detect_language.
    if stats.get("sample_text_length", 0) < 100:
        return False
    return stats.get("chinese_ratio", 0.0) > threshold

View File

@@ -0,0 +1,241 @@
"""
Nougat提取服务
使用Nougat OCR提取学术PDF的高质量文本
保留表格、公式等结构信息
"""
import subprocess
import os
from pathlib import Path
from typing import Dict, Any, Optional, Callable
from loguru import logger
def check_nougat_available() -> bool:
    """
    Check whether the ``nougat`` Python module can be imported.

    Returns:
        True if the import succeeds; False on ImportError or any other
        exception raised during the check.
    """
    available = False
    try:
        import nougat
        detected_version = getattr(nougat, '__version__', 'unknown')
        logger.info(f"Nougat module is available (version: {detected_version})")
        available = True
    except ImportError:
        logger.warning("Nougat module not found")
    except Exception as exc:
        logger.error(f"检查Nougat失败: {str(exc)}")
    return available
def extract_pdf_nougat(
    file_path: str,
    output_dir: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> Dict[str, Any]:
    """
    Extract a PDF to Markdown by running the ``nougat`` command-line tool.

    Args:
        file_path: Path of the PDF file.
        output_dir: Directory for Nougat's output. When omitted, a private
            temporary directory is created next to the input file and
            removed again before returning.
        progress_callback: Accepted for interface compatibility; this
            implementation does not invoke it.

    Returns:
        On success::

            {
                "success": True,
                "method": "nougat",
                "text": "<markdown>",
                "format": "markdown",
                "metadata": {
                    "char_count": 50000,
                    "quality_score": 0.95,
                    "has_tables": True,
                    "has_formulas": True,
                    "has_structure": True
                }
            }

        On failure: ``{"success": False, "error": "<message>",
        "method": "nougat"}``. Failure covers Nougat not being installed, a
        non-zero exit code, a missing output file, and the 5-minute timeout.
    """
    import shutil
    import tempfile

    # Only a directory created here is removed here; one supplied by the
    # caller is left untouched.
    owned_output_dir = None
    try:
        if not check_nougat_available():
            raise Exception("Nougat未安装请先安装pip install nougat-ocr")
        logger.info(f"开始使用Nougat提取: {file_path}")

        if output_dir is None:
            # A private directory per call: a shared "nougat_output" folder
            # let concurrent extractions of same-named files overwrite each
            # other and left every .mmd file behind.
            owned_output_dir = tempfile.mkdtemp(
                prefix="nougat_output_",
                dir=os.path.dirname(file_path) or None,
            )
            output_dir = owned_output_dir
        else:
            Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Command shape: nougat <pdf_path> -o <output_dir>
        cmd = [
            'nougat',
            file_path,
            '-o', output_dir,
            '--markdown',     # Markdown output
            '--no-skipping'   # do not skip any page
        ]
        logger.info(f"执行命令: {' '.join(cmd)}")

        # subprocess.run kills and reaps the child when the timeout expires;
        # Popen.communicate(timeout=...) would raise and leave it running.
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=300  # 5 minutes
        )
        if completed.returncode != 0:
            logger.error(f"Nougat执行失败: {completed.stderr}")
            raise Exception(f"Nougat执行失败: {completed.stderr}")

        # Nougat writes <input stem>.mmd into the output directory
        pdf_name = Path(file_path).stem
        output_file = Path(output_dir) / f"{pdf_name}.mmd"
        if not output_file.exists():
            raise Exception(f"Nougat输出文件不存在: {output_file}")
        with open(output_file, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        quality_result = evaluate_nougat_quality(markdown_text)
        logger.info(f"Nougat提取完成: 质量={quality_result['quality_score']:.2f}")
        return {
            "success": True,
            "method": "nougat",
            "text": markdown_text,
            "format": "markdown",
            "metadata": {
                "char_count": len(markdown_text),
                "quality_score": quality_result['quality_score'],
                "has_tables": quality_result['has_tables'],
                "has_formulas": quality_result['has_formulas'],
                "has_structure": quality_result['has_structure']
            }
        }
    except subprocess.TimeoutExpired:
        logger.error("Nougat处理超时>5分钟")
        return {
            "success": False,
            "error": "处理超时",
            "method": "nougat"
        }
    except Exception as e:
        logger.error(f"Nougat提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "nougat"
        }
    finally:
        if owned_output_dir is not None:
            shutil.rmtree(owned_output_dir, ignore_errors=True)
def evaluate_nougat_quality(text: str) -> Dict[str, Any]:
    """
    Score the quality of Markdown text produced by Nougat.

    Scoring (the result is clamped to the range 0..1):
    - base score: 0.5
    - heading structure (two or more '##', or three or more '#'): +0.2
    - table markers ('|' together with '---'): +0.15
    - formula markers ('$' or '\\('): +0.15
    - more than 5000 characters: +0.1
    - garbled text (more than 5% suspicious characters): -0.3

    Args:
        text: Markdown text extracted by Nougat.

    Returns:
        {
            "quality_score": 0.92,
            "has_structure": True,
            "has_tables": True,
            "has_formulas": True,
            "has_garbled": False
        }
    """
    score = 0.5  # base score

    # Heading structure (Markdown titles)
    has_structure = bool(text.count('##') >= 2 or text.count('#') >= 3)
    if has_structure:
        score += 0.2

    # Tables
    has_tables = '|' in text and '---' in text
    if has_tables:
        score += 0.15

    # Formulas (LaTeX delimiters); '$' already covers '$$'
    has_formulas = '$' in text or '\\(' in text
    if has_formulas:
        score += 0.15

    # Sufficient length
    if len(text) > 5000:
        score += 0.1

    # Garbled-text heuristic: count U+FFFD replacement characters and code
    # points above the Basic Multilingual Plane. The replacement character is
    # written as an escape on purpose: a raw or mis-encoded literal (such as
    # the text "<EFBFBD>") turns ordinary characters like '<', '>', 'E', 'F',
    # 'B' and 'D' into false positives.
    replacement_char = '\ufffd'
    garbled_chars = sum(1 for c in text if ord(c) > 65535 or c == replacement_char)
    has_garbled = garbled_chars > len(text) * 0.05  # more than 5%
    if has_garbled:
        score -= 0.3

    # Clamp to 0..1
    score = max(0.0, min(1.0, score))
    return {
        "quality_score": score,
        "has_structure": has_structure,
        "has_tables": has_tables,
        "has_formulas": has_formulas,
        "has_garbled": has_garbled
    }
def get_nougat_info() -> Dict[str, Any]:
    """
    Describe the installed Nougat module.

    Returns:
        ``{"available": True, "version": ...}`` when ``nougat`` imports
        (version is 'unknown' if the module exposes no ``__version__``),
        otherwise ``{"available": False, "error": ...}``.
    """
    try:
        import nougat
        info = {"available": True}
        info["version"] = getattr(nougat, '__version__', 'unknown')
    except ImportError:
        info = {"available": False, "error": "Nougat未安装"}
    except Exception as exc:
        info = {"available": False, "error": str(exc)}
    return info

View File

@@ -0,0 +1,191 @@
"""
PDF文本提取服务
使用PyMuPDF (fitz)提取PDF文本内容
"""
import fitz # PyMuPDF
from typing import Dict, Any
from loguru import logger
def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
    """
    Extract the text of every page of a PDF with PyMuPDF.

    Pages that yield text are joined with a "--- 第 N 页 ---" separator in
    front of each. A page that fails to extract is logged and skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        On success::

            {
                "success": True,
                "method": "pymupdf",
                "text": "<text>",
                "format": "plain_text",
                "metadata": {
                    "page_count": 20,
                    "char_count": 50000,
                    "has_text": True
                }
            }

        ``has_text`` is False when 100 characters or fewer were extracted.
        On failure: ``{"success": False, "error": "<message>",
        "method": "pymupdf"}``.
    """
    try:
        logger.info(f"开始使用PyMuPDF提取: {file_path}")
        text_parts = []
        # The context manager closes the document even if an exception is
        # raised part-way through; the explicit close() was skipped then.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            logger.info(f"PDF页数: {page_count}")
            for page_num in range(page_count):
                try:
                    page = doc[page_num]
                    text = page.get_text()
                    if text.strip():
                        # Page separator
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(text)
                    logger.debug(f"{page_num + 1} 页提取了 {len(text)} 个字符")
                except Exception as e:
                    logger.warning(f"{page_num + 1} 页提取失败: {str(e)}")
                    continue

        full_text = "".join(text_parts)
        char_count = len(full_text)

        # At most 100 characters suggests a scanned or image-only PDF
        has_text = char_count > 100
        if not has_text:
            logger.warning("PDF可能是扫描版或无文本内容")
        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")
        return {
            "success": True,
            "method": "pymupdf",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": char_count,
                "has_text": has_text
            }
        }
    except Exception as e:
        logger.error(f"PyMuPDF提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf"
        }
def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
    """
    Extract PDF text with PyMuPDF by walking the page's block/line/span tree.

    Text blocks (type 0) are traversed block -> line -> span; the non-blank
    spans of a page are joined with single spaces, and each page that yields
    text is preceded by a "--- 第 N 页 ---" separator.

    Args:
        file_path: Path of the PDF file.

    Returns:
        On success: ``{"success": True, "method": "pymupdf_layout",
        "text": ..., "format": "plain_text",
        "metadata": {"page_count", "char_count"}}``.
        On failure: ``{"success": False, "error": ...,
        "method": "pymupdf_layout"}``.
    """
    try:
        logger.info(f"开始使用PyMuPDF提取保留布局: {file_path}")
        text_parts = []
        # The context manager closes the document even if an exception is
        # raised part-way through; the explicit close() was skipped then.
        with fitz.open(file_path) as doc:
            page_count = len(doc)
            for page_num in range(page_count):
                try:
                    page = doc[page_num]
                    # "dict" mode exposes the block/line/span structure
                    blocks = page.get_text("dict")["blocks"]
                    page_text = [
                        span.get("text", "")
                        for block in blocks
                        if block["type"] == 0  # text block
                        for line in block.get("lines", [])
                        for span in line.get("spans", [])
                        if span.get("text", "").strip()
                    ]
                    if page_text:
                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
                        text_parts.append(" ".join(page_text))
                except Exception as e:
                    logger.warning(f"{page_num + 1} 页处理失败: {str(e)}")
                    continue

        full_text = "".join(text_parts)
        return {
            "success": True,
            "method": "pymupdf_layout",
            "text": full_text,
            "format": "plain_text",
            "metadata": {
                "page_count": page_count,
                "char_count": len(full_text)
            }
        }
    except Exception as e:
        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
        # "method" is included so the error shape matches extract_pdf_pymupdf
        return {
            "success": False,
            "error": str(e),
            "method": "pymupdf_layout"
        }
def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """
    Read basic metadata from a PDF with PyMuPDF.

    Args:
        file_path: Path of the PDF file.

    Returns:
        ``{"page_count", "metadata", "is_encrypted", "is_pdf"}`` taken from
        the opened document, or an empty dict when the file cannot be read.
    """
    try:
        # The context manager closes the document even if reading one of the
        # attributes raises; the explicit close() was skipped in that case.
        with fitz.open(file_path) as doc:
            return {
                "page_count": len(doc),
                "metadata": doc.metadata,
                "is_encrypted": doc.is_encrypted,
                "is_pdf": doc.is_pdf
            }
    except Exception as e:
        logger.error(f"获取PDF元数据失败: {str(e)}")
        return {}

View File

@@ -0,0 +1,192 @@
"""
PDF处理主服务
实现顺序降级策略:
1. 检测语言
2. 中文PDF → PyMuPDF快速
3. 英文PDF → Nougat → 失败降级PyMuPDF
"""
from typing import Dict, Any, Optional
from loguru import logger
from .language_detector import detect_language
from .nougat_extractor import extract_pdf_nougat, check_nougat_available
from .pdf_extractor import extract_pdf_pymupdf
def _nougat_result_if_acceptable(file_path: str, language) -> Optional[Dict[str, Any]]:
    """Run Nougat and return its annotated result, or None when it failed or scored too low."""
    min_quality = 0.7  # minimum Nougat quality_score accepted without fallback
    try:
        nougat_result = extract_pdf_nougat(file_path)
        if nougat_result['success']:
            quality_score = nougat_result['metadata'].get('quality_score', 0)
            logger.info(f"Nougat质量评分: {quality_score:.2f}")
            if quality_score >= min_quality:
                logger.info("✅ Nougat处理成功质量合格")
                nougat_result['reason'] = 'english_pdf_high_quality'
                nougat_result['detected_language'] = language
                return nougat_result
            logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f}降级到PyMuPDF")
            failure = f"Quality too low: {quality_score}"
        else:
            logger.warning("⚠️ Nougat提取失败降级到PyMuPDF")
            failure = nougat_result.get('error', 'Nougat failed')
    except Exception as e:
        # Any Nougat error (including a malformed result) triggers the fallback.
        failure = str(e)
    logger.warning(f"Nougat处理失败: {failure}降级到PyMuPDF")
    return None


def extract_pdf(
    file_path: str,
    force_method: Optional[str] = None
) -> Dict[str, Any]:
    """
    Extract text from a PDF using a sequential fallback strategy.

    Flow:
        1. If force_method is 'nougat' or 'pymupdf', run only that extractor.
           Any other non-empty value is logged and ignored.
        2. Otherwise detect the document language.
        3. 'chinese' -> PyMuPDF directly.
        4. Anything else -> Nougat when it is available and its quality_score
           is at least 0.7; otherwise fall back to PyMuPDF.

    Args:
        file_path: Path of the PDF file.
        force_method: Extractor to force ('nougat' | 'pymupdf'), or None for
            the automatic strategy.

    Returns:
        The result dict of the extractor that was used. Forced results carry
        "reason" ('force_nougat' | 'force_pymupdf'). Successful automatic
        results carry "detected_language" and "reason" ('chinese_pdf' |
        'nougat_unavailable' | 'english_pdf_high_quality' |
        'nougat_failed_or_low_quality'); a successful fallback after a Nougat
        attempt also carries "fallback": True. If an unexpected exception
        escapes, {"success": False, "error": ..., "method": "unknown"}.
    """
    try:
        logger.info(f"开始处理PDF: {file_path}")

        # A forced method does not depend on the language, so handle it before
        # paying for (or failing in) language detection.
        if force_method:
            logger.info(f"强制使用方法: {force_method}")
            if force_method == 'nougat':
                result = extract_pdf_nougat(file_path)
                result['reason'] = 'force_nougat'
                return result
            if force_method == 'pymupdf':
                result = extract_pdf_pymupdf(file_path)
                result['reason'] = 'force_pymupdf'
                return result
            logger.warning(f"未知的强制方法: {force_method},将使用自动策略")

        # Step 1: language detection
        logger.info("[Step 1] 检测PDF语言...")
        language = detect_language(file_path)
        logger.info(f"检测结果: {language}")

        # Step 2: Chinese PDFs go straight to PyMuPDF
        if language == 'chinese':
            logger.info("[Step 2] 中文PDF使用PyMuPDF快速处理")
            result = extract_pdf_pymupdf(file_path)
            if result['success']:
                result['reason'] = 'chinese_pdf'
                result['detected_language'] = language
                logger.info("✅ PyMuPDF处理成功中文PDF")
            else:
                logger.error("❌ PyMuPDF处理失败")
            return result

        # Step 3: other PDFs try Nougat first
        logger.info("[Step 3] 英文PDF尝试Nougat高质量解析")
        if not check_nougat_available():
            logger.warning("⚠️ Nougat不可用降级到PyMuPDF")
            result = extract_pdf_pymupdf(file_path)
            if result['success']:
                result['reason'] = 'nougat_unavailable'
                result['detected_language'] = language
            return result

        nougat_result = _nougat_result_if_acceptable(file_path, language)
        if nougat_result is not None:
            return nougat_result

        # Step 4: fall back to PyMuPDF
        logger.info("[Step 4] 降级使用PyMuPDF")
        result = extract_pdf_pymupdf(file_path)
        if result['success']:
            result['reason'] = 'nougat_failed_or_low_quality'
            result['detected_language'] = language
            result['fallback'] = True
            logger.info("✅ PyMuPDF处理成功降级方案")
        else:
            logger.error("❌ PyMuPDF处理也失败了")
        return result
    except Exception as e:
        logger.error(f"PDF处理完全失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "unknown"
        }
def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
    """
    Report which extractor would be chosen for a PDF without extracting it.

    Args:
        file_path: Path of the PDF file.

    Returns:
        A dict with "detected_language", "recommended_method"
        ('nougat' | 'pymupdf'), "reason" and "nougat_available".
        If detection or the availability check raises, {"error": ...}.
    """
    try:
        detected = detect_language(file_path)
        can_use_nougat = check_nougat_available()

        # Chinese always maps to PyMuPDF; everything else prefers Nougat
        # whenever it is available.
        if detected == 'chinese':
            method, why = 'pymupdf', '中文PDF推荐使用PyMuPDF快速处理'
        elif can_use_nougat:
            method, why = 'nougat', '英文PDF推荐使用Nougat高质量解析'
        else:
            method, why = 'pymupdf', 'Nougat不可用使用PyMuPDF'

        strategy = {
            "detected_language": detected,
            "recommended_method": method,
            "reason": why,
            "nougat_available": can_use_nougat
        }
        return strategy
    except Exception as failure:
        logger.error(f"获取处理策略失败: {str(failure)}")
        return {"error": str(failure)}

View File

@@ -0,0 +1,320 @@
"""
Txt文本文件提取服务
直接读取纯文本文件
支持多种编码自动检测
"""
from pathlib import Path
from typing import Dict, Any, List
from loguru import logger
import chardet
def extract_txt(file_path: str) -> Dict[str, Any]:
    """
    Read a .txt file and return its decoded content with basic statistics.

    The file must exist, carry a ".txt" suffix (case-insensitive) and be
    non-empty. Its encoding is guessed with detect_encoding and the whole
    file is then decoded through read_with_fallback.

    Args:
        file_path: Path of the text file.

    Returns:
        On success: {"success": True, "text", "encoding", "metadata"} where
        metadata holds "char_count", "line_count" (newline count + 1),
        "file_size" (bytes) and "size_kb".
        On failure: {"success": False, "error", "text": "", "metadata"}.
    """
    def _failed(message: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        # Every failure result has the same shape; only the message and the
        # metadata payload differ.
        return {
            "success": False,
            "error": message,
            "text": "",
            "metadata": metadata
        }

    try:
        source = Path(file_path)

        # Guard clauses: existence, suffix, then emptiness.
        if not source.exists():
            return _failed(f"文件不存在: {file_path}", {})

        if source.suffix.lower() != '.txt':
            return _failed(f"不支持的文件格式: {source.suffix},仅支持.txt", {})

        size_bytes = source.stat().st_size
        if size_bytes == 0:
            return _failed(
                "文件为空",
                {"char_count": 0, "line_count": 0, "file_size": 0}
            )

        logger.info(f"开始提取Txt文件: {source.name} ({size_bytes / 1024:.2f} KB)")

        guessed_encoding = detect_encoding(file_path)
        logger.info(f"检测到编码: {guessed_encoding}")

        # The guessed encoding is tried first, followed by common fallbacks.
        content, used_encoding = read_with_fallback(file_path, guessed_encoding)
        if content is None:
            return _failed("无法解码文件,尝试了多种编码均失败", {})

        total_chars = len(content)
        total_lines = content.count('\n') + 1
        logger.info(f"Txt提取成功: {total_chars}个字符, {total_lines}")

        stats = {
            "char_count": total_chars,
            "line_count": total_lines,
            "file_size": size_bytes,
            "size_kb": round(size_bytes / 1024, 2)
        }
        return {
            "success": True,
            "text": content,
            "encoding": used_encoding,
            "metadata": stats
        }
    except Exception as exc:
        logger.error(f"Txt提取失败: {str(exc)}")
        return _failed(str(exc), {})
def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
    """
    Guess a file's text encoding from its leading bytes using chardet.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of bytes read from the start of the file.

    Returns:
        The encoding name reported by chardet, or 'utf-8' when the reported
        confidence is below 0.7, when no encoding is reported, or when
        detection raises.
    """
    default_encoding = 'utf-8'
    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(sample_size)

        guess = chardet.detect(sample)
        encoding, confidence = guess['encoding'], guess['confidence']
        logger.debug(f"编码检测: {encoding} (置信度: {confidence:.2f})")

        # A weak guess is not trusted; use the default instead.
        if confidence < 0.7:
            logger.warning(f"编码置信度较低({confidence:.2f})将尝试UTF-8")
            return default_encoding

        return encoding or default_encoding
    except Exception as exc:
        logger.warning(f"编码检测失败: {str(exc)}使用UTF-8")
        return default_encoding
def read_with_fallback(file_path: str, primary_encoding: str) -> "tuple[str | None, str | None]":
    """
    Read a text file, trying several encodings until one decodes strictly.

    The preferred encoding is tried first, followed by common UTF-8, Chinese
    and Western encodings. Aliases of the same codec are tried only once and
    unknown codec names are skipped. 'cp1252' is tried before 'latin-1'
    because 'latin-1' accepts every byte sequence and would otherwise make
    the later candidates unreachable.

    Args:
        file_path: Path of the file to read.
        primary_encoding: Encoding to try first (may be empty or None).

    Returns:
        (text, encoding_used) on success, with any leading BOM character
        removed from the text; (None, None) when the file cannot be read or
        no candidate decodes it.
    """
    import codecs  # local import: only needed here to normalise codec names

    # Candidate encodings in priority order.
    encodings = [
        primary_encoding,
        'utf-8',
        'utf-8-sig',  # UTF-8 with BOM
        'gbk',
        'gb2312',
        'gb18030',
        'cp1252',
        'latin-1',  # never fails, so it must come after cp1252
        'iso-8859-1'
    ]

    # De-duplicate by canonical codec name so aliases are not retried.
    seen = set()
    unique_encodings = []
    for enc in encodings:
        if not enc:
            continue
        try:
            canonical = codecs.lookup(enc).name
        except LookupError as e:
            logger.warning(f"使用编码 {enc} 读取失败: {str(e)}")
            continue
        if canonical not in seen:
            seen.add(canonical)
            unique_encodings.append(enc)

    for encoding in unique_encodings:
        try:
            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
                text = f.read()
        except UnicodeDecodeError:
            logger.debug(f"编码 {encoding} 解码失败,尝试下一个")
            continue
        except OSError as e:
            # An I/O failure does not depend on the encoding; retrying is pointless.
            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
            break
        except Exception as e:
            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
            continue

        # Plain 'utf-8' decodes a BOM as U+FEFF instead of dropping it.
        if text.startswith('\ufeff'):
            text = text[1:]
        logger.info(f"成功使用编码: {encoding}")
        return text, encoding

    # Every candidate failed.
    logger.error("所有编码尝试均失败")
    return None, None
def validate_txt_file(file_path: str, max_size: int = 10 * 1024 * 1024) -> Dict[str, Any]:
    """
    Check whether a file is an acceptable .txt input.

    Checks run in this order: existence, ".txt" suffix (case-insensitive),
    size not above max_size, non-empty. Only then is the encoding detected.

    Args:
        file_path: Path of the file to validate.
        max_size: Largest accepted file size in bytes (default 10MB).

    Returns:
        {"valid": True, "reason": ..., "file_info": {"filename", "size",
        "size_kb", "detected_encoding"}} when the file passes every check,
        otherwise {"valid": False, "reason": ...} without "file_info".
    """
    try:
        file_path_obj = Path(file_path)

        if not file_path_obj.exists():
            return {
                "valid": False,
                "reason": "文件不存在"
            }

        if file_path_obj.suffix.lower() != '.txt':
            return {
                "valid": False,
                "reason": f"不支持的格式: {file_path_obj.suffix}(仅支持.txt"
            }

        file_size = file_path_obj.stat().st_size
        if file_size > max_size:
            # The limit is rendered in MB; the default prints as "10".
            return {
                "valid": False,
                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB限制{max_size / 1024 / 1024:g}MB"
            }

        if file_size == 0:
            return {
                "valid": False,
                "reason": "文件为空"
            }

        encoding = detect_encoding(str(file_path_obj))
        return {
            "valid": True,
            "reason": "文件有效",
            "file_info": {
                "filename": file_path_obj.name,
                "size": file_size,
                "size_kb": round(file_size / 1024, 2),
                "detected_encoding": encoding
            }
        }
    except Exception as e:
        return {
            "valid": False,
            "reason": f"验证失败: {str(e)}"
        }
def preview_txt(file_path: str, lines: int = 10) -> Dict[str, Any]:
    """
    Return the first lines of a .txt file.

    The file is read in full through extract_txt and then split on '\\n'.

    Args:
        file_path: Path of the text file.
        lines: Number of leading lines to include. Negative values are
            treated as 0 so they cannot slice from the end of the file.

    Returns:
        On success: {"success": True, "preview", "total_lines",
        "preview_lines"}.
        On failure: the failure dict from extract_txt, or
        {"success": False, "error", "preview": ""} if an exception is raised;
        both shapes always carry a "preview" key.
    """
    try:
        result = extract_txt(file_path)
        if not result['success']:
            # Keep the failure shape consistent with the exception path below.
            result.setdefault("preview", "")
            return result

        text_lines = result['text'].split('\n')
        # Clamp negative counts; None still means "all lines", as slicing allows.
        limit = 0 if (lines is not None and lines < 0) else lines
        preview_lines = text_lines[:limit]
        return {
            "success": True,
            "preview": '\n'.join(preview_lines),
            "total_lines": len(text_lines),
            "preview_lines": len(preview_lines)
        }
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "preview": ""
        }

View File

@@ -0,0 +1,37 @@
@echo off
REM Launcher for the document extraction microservice on Windows.
REM Switch the console to UTF-8 so the messages below render correctly.
chcp 65001 >nul
REM Run from the script's own directory so that venv\, requirements.txt and
REM main:app resolve regardless of where the script was started from.
cd /d "%~dp0"
echo ================================
echo 启动文档提取微服务
echo ================================
echo.
REM Activate the virtual environment when one exists next to this script.
if exist venv\Scripts\activate.bat (
    echo [1/3] 激活虚拟环境...
    call venv\Scripts\activate
) else (
    echo 警告: 未找到虚拟环境使用全局Python
)
REM Install dependencies only when fastapi is not already listed by pip.
echo [2/3] 检查依赖...
pip list | findstr "fastapi" >nul
if errorlevel 1 (
    echo 依赖未安装,正在安装...
    pip install -r requirements.txt
)
REM Start the service with auto-reload, listening on all interfaces.
echo [3/3] 启动服务...
echo.
echo 服务地址: http://localhost:8000
echo 健康检查: http://localhost:8000/api/health
echo API文档: http://localhost:8000/docs
echo.
echo 按 Ctrl+C 停止服务
echo.
uvicorn main:app --host 0.0.0.0 --port 8000 --reload
pause

View File

@@ -0,0 +1,29 @@
这是一个测试文本文件。
用于测试Txt文件提取功能。
AI临床研究平台 - Phase 2 Day 3测试
功能特点:
1. 自动编码检测
2. 支持UTF-8、GBK等多种编码
3. 统计字符数和行数
4. 快速文本提取
测试内容包含:
- 中文字符
- 英文字符 (English characters)
- 数字 123456
- 特殊符号 !@#$%^&*()
多行文本测试:
第一行
第二行
第三行
结束。

View File

@@ -0,0 +1,171 @@
"""
服务测试脚本
测试文档提取微服务的各项功能
"""
import requests
import sys
from pathlib import Path
# Base URL of the extraction service; the test functions build their request URLs from it.
BASE_URL = "http://localhost:8000"
def test_health():
    """
    Call the health-check endpoint and print the reported status.

    Returns:
        True when the endpoint answers 200 and the expected fields are
        present; False on any other status, a malformed payload, a timeout
        or a connection error.
    """
    print("\n" + "="*50)
    print("测试1: 健康检查")
    print("="*50)
    try:
        # A timeout keeps the script from hanging when the service is stuck.
        response = requests.get(f"{BASE_URL}/api/health", timeout=10)
        print(f"状态码: {response.status_code}")
        if response.status_code != 200:
            print("❌ 健康检查失败")
            return False

        data = response.json()
        print(f"服务状态: {data['status']}")
        print(f"PyMuPDF: {data['checks']['pymupdf']['available']} (v{data['checks']['pymupdf']['version']})")
        print(f"临时目录: {data['checks']['temp_dir']['path']}")
        print("✅ 健康检查通过")
        return True
    except Exception as e:
        print(f"❌ 连接失败: {str(e)}")
        print("提示: 请确保服务已启动python main.py")
        return False
def test_pdf_extraction(pdf_file: str = None):
    """
    Upload a PDF to the extraction endpoint and print a summary of the result.

    Args:
        pdf_file: Path of the PDF to upload; when empty the test is skipped.

    Returns:
        None when skipped, True when the endpoint answers 200 with the
        expected fields, False when the file is missing, the status is not
        200, or the request fails or times out.
    """
    print("\n" + "="*50)
    print("测试2: PDF文本提取")
    print("="*50)
    if not pdf_file:
        print("跳过: 未提供测试PDF文件")
        print("使用方法: python test_service.py <pdf文件路径>")
        return None

    pdf_path = Path(pdf_file)
    if not pdf_path.exists():
        print(f"❌ 文件不存在: {pdf_file}")
        return False

    try:
        print(f"上传文件: {pdf_path.name}")
        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")
        with open(pdf_path, 'rb') as f:
            files = {'file': (pdf_path.name, f, 'application/pdf')}
            # (connect, read) timeout: fail fast when the service is down,
            # but allow a long-running extraction to finish.
            response = requests.post(
                f"{BASE_URL}/api/extract/pdf",
                files=files,
                timeout=(10, 300)
            )
        print(f"状态码: {response.status_code}")
        if response.status_code == 200:
            data = response.json()
            print("\n提取结果:")
            print(f"方法: {data['method']}")
            print(f"页数: {data['metadata']['page_count']}")
            print(f"字符数: {data['metadata']['char_count']}")
            print(f"文本长度: {len(data['text'])} 字符")
            # Show the first 500 characters only.
            print("\n文本预览:")
            print("-" * 50)
            print(data['text'][:500])
            if len(data['text']) > 500:
                print("...")
            print("-" * 50)
            print("\n✅ PDF提取成功")
            return True
        else:
            print(f"❌ 提取失败: {response.text}")
            return False
    except Exception as e:
        print(f"❌ 请求失败: {str(e)}")
        return False
def test_root():
    """
    Call the service root path and print the reported service name and version.

    Returns:
        True when the root path answers 200 and the expected fields are
        present; False on any other status, a malformed payload, a timeout
        or a connection error.
    """
    print("\n" + "="*50)
    print("测试0: 根路径")
    print("="*50)
    try:
        # A timeout keeps the script from hanging when the service is stuck.
        response = requests.get(f"{BASE_URL}/", timeout=10)
        print(f"状态码: {response.status_code}")
        if response.status_code != 200:
            print("❌ 根路径异常")
            return False

        data = response.json()
        print(f"服务: {data['service']}")
        print(f"版本: {data['version']}")
        print("✅ 根路径正常")
        return True
    except Exception as e:
        print(f"❌ 连接失败: {str(e)}")
        return False
def main():
    """
    Run every service test, print a summary and report the outcome.

    The optional first command-line argument is the path of a PDF used by
    the PDF extraction test; without it that test is skipped. Skipped tests
    (result None) are excluded from the pass rate.

    Returns:
        0 when every executed test passed, 1 otherwise, so the value can be
        used as the process exit code.
    """
    print("\n" + "="*50)
    print("文档提取微服务 - 测试套件")
    print("="*50)

    # PDF path from the command line, if provided.
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None

    results = [
        ("根路径", test_root()),
        ("健康检查", test_health()),
        ("PDF提取", test_pdf_extraction(pdf_file)),
    ]

    print("\n" + "="*50)
    print("测试总结")
    print("="*50)
    for name, result in results:
        if result is True:
            status = "✅ 通过"
        elif result is False:
            status = "❌ 失败"
        else:
            status = "⏭️ 跳过"
        print(f"{name}: {status}")

    passed = sum(1 for _, r in results if r is True)
    total = sum(1 for _, r in results if r is not None)
    print(f"\n通过率: {passed}/{total}")
    if passed == total:
        print("\n🎉 所有测试通过!")
        return 0
    print("\n⚠️ 部分测试失败")
    return 1


if __name__ == "__main__":
    # Propagate the outcome so shells and CI can detect failures.
    sys.exit(main())