From 39eb62ee793555c9c093af8076a6686dffe9dad3 Mon Sep 17 00:00:00 2001 From: AI Clinical Dev Team Date: Sun, 16 Nov 2025 15:32:44 +0800 Subject: [PATCH] feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv --- .gitignore | 72 +++ extraction_service/.gitignore | 40 ++ extraction_service/README.md | 181 +++++++ extraction_service/install.bat | 89 +++ extraction_service/install_nougat.bat | 88 +++ extraction_service/main.py | 508 ++++++++++++++++++ extraction_service/requirements.txt | 31 ++ extraction_service/services/__init__.py | 11 + extraction_service/services/docx_extractor.py | 257 +++++++++ extraction_service/services/file_utils.py | 88 +++ .../services/language_detector.py | 160 ++++++ .../services/nougat_extractor.py | 241 +++++++++ extraction_service/services/pdf_extractor.py | 191 +++++++ extraction_service/services/pdf_processor.py | 192 +++++++ extraction_service/services/txt_extractor.py | 320 +++++++++++ extraction_service/start.bat | 37 ++ extraction_service/test_files/test.txt | 29 + extraction_service/test_service.py | 171 ++++++ 18 files changed, 2706 insertions(+) create mode 100644 extraction_service/.gitignore create mode 100644 extraction_service/README.md create mode 100644 extraction_service/install.bat create mode 100644 extraction_service/install_nougat.bat create mode 100644 extraction_service/main.py create mode 100644 extraction_service/requirements.txt create mode 100644 extraction_service/services/__init__.py create mode 100644 extraction_service/services/docx_extractor.py create mode 100644 extraction_service/services/file_utils.py create mode 100644 extraction_service/services/language_detector.py create mode 100644 extraction_service/services/nougat_extractor.py create mode 100644 extraction_service/services/pdf_extractor.py create mode 100644 extraction_service/services/pdf_processor.py create mode 100644 extraction_service/services/txt_extractor.py create mode 100644 extraction_service/start.bat create 
mode 100644 extraction_service/test_files/test.txt create mode 100644 extraction_service/test_service.py diff --git a/.gitignore b/.gitignore index 28bb1399..da1252d8 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,78 @@ tmp/ temp/ *.tmp +# ==================== Python ==================== +# Virtual environments (重要!避免提交 2+ GB 的依赖) +venv/ +env/ +.venv/ +ENV/ +env.bak/ +venv.bak/ +# Python cache +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +# PyInstaller +*.manifest +*.spec +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# Celery +celerybeat-schedule +celerybeat.pid + +# Environments +.env +.venv + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Unit test / coverage +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre +.pyre/ + +# pytype +.pytype/ + +# Cython +cython_debug/ diff --git a/extraction_service/.gitignore b/extraction_service/.gitignore new file mode 100644 index 00000000..45804457 --- /dev/null +++ b/extraction_service/.gitignore @@ -0,0 +1,40 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +dist/ +*.egg-info/ + +# 环境变量 +.env + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# 临时文件 +/tmp/ +*.log + +# 测试 +.pytest_cache/ +.coverage +htmlcov/ + +# OS +.DS_Store +Thumbs.db + + + + + + diff --git a/extraction_service/README.md b/extraction_service/README.md new file mode 100644 index 00000000..0f8a5753 --- /dev/null +++ b/extraction_service/README.md @@ -0,0 +1,181 @@ +# 文档提取微服务 + +基于FastAPI的文档文本提取服务,支持PDF、Docx、Txt格式。 + +## 功能特性 + +- ✅ **PDF提取**:使用PyMuPDF快速提取PDF文本 +- ⏳ **Docx提取**:使用Mammoth提取Word文档(Day 3) +- ⏳ **Txt提取**:支持多种编码(Day 3) +- ⏳ **语言检测**:自动检测PDF语言(Day 2) +- ⏳ **Nougat集成**:高质量学术PDF解析(Day 2) + +## 快速开始 + +### 1. 
安装依赖 + +```bash +cd extraction_service + +# 创建虚拟环境(推荐) +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate + +# 安装依赖 +pip install -r requirements.txt +``` + +### 2. 配置环境变量 + +```bash +# 复制示例配置 +cp .env.example .env + +# 编辑配置(可选) +# SERVICE_PORT=8000 +# DEBUG=True +``` + +### 3. 启动服务 + +```bash +# 开发模式(自动重载) +python main.py + +# 或使用uvicorn +uvicorn main:app --reload --port 8000 +``` + +服务将在 http://localhost:8000 启动 + +### 4. 测试服务 + +#### 健康检查 + +```bash +curl http://localhost:8000/api/health +``` + +返回: +```json +{ + "status": "healthy", + "checks": { + "pymupdf": { + "available": true, + "version": "1.23.8" + }, + "temp_dir": { + "path": "/tmp/extraction_service", + "writable": true + } + } +} +``` + +#### PDF文本提取 + +```bash +curl -X POST http://localhost:8000/api/extract/pdf \ + -F "file=@test.pdf" +``` + +返回: +```json +{ + "success": true, + "method": "pymupdf", + "text": "提取的文本内容...", + "metadata": { + "page_count": 20, + "char_count": 50000, + "file_size": 1024000, + "filename": "test.pdf" + } +} +``` + +## API文档 + +启动服务后访问: +- Swagger UI: http://localhost:8000/docs +- ReDoc: http://localhost:8000/redoc + +## 项目结构 + +``` +extraction_service/ +├── main.py # 主应用入口 +├── requirements.txt # Python依赖 +├── .env.example # 环境变量示例 +├── README.md # 本文件 +├── services/ # 服务模块 +│ ├── __init__.py +│ ├── pdf_extractor.py # PDF提取(PyMuPDF) +│ ├── nougat_extractor.py # Nougat提取(Day 2) +│ ├── docx_extractor.py # Docx提取(Day 3) +│ ├── txt_extractor.py # Txt提取(Day 3) +│ ├── language_detector.py # 语言检测(Day 2) +│ └── file_utils.py # 文件工具 +└── tests/ # 测试文件(待添加) +``` + +## 开发计划 + +### ✅ Day 1(已完成) +- [x] FastAPI项目搭建 +- [x] PyMuPDF集成 +- [x] PDF文本提取功能 +- [x] 健康检查API + +### ⏳ Day 2(进行中) +- [ ] 安装Nougat +- [ ] 语言检测功能 +- [ ] Nougat提取逻辑 +- [ ] 顺序降级机制 + +### ⏳ Day 3 +- [ ] Docx提取(Mammoth) +- [ ] Txt提取(多编码) +- [ ] 文件格式验证 + +## 依赖说明 + +| 库 | 版本 | 用途 | +|---|---|---| +| fastapi | 0.104.1 | Web框架 | +| uvicorn | 0.24.0 | ASGI服务器 | +| PyMuPDF | 1.23.8 | PDF文本提取 | +| 
pdfplumber | 0.10.3 | PDF语言检测 | +| mammoth | 1.6.0 | Docx提取 | +| langdetect | 1.0.9 | 语言检测 | +| loguru | 0.7.2 | 日志管理 | + +## 性能指标 + +| 操作 | 目标时间 | +|---|---| +| 20页PDF(PyMuPDF) | <30秒 | +| 10页Docx | <10秒 | +| 1MB Txt | <5秒 | + +## 常见问题 + +### Q: PyMuPDF安装失败? +A: 确保Python版本>=3.8,使用pip安装:`pip install PyMuPDF` + +### Q: 服务无法启动? +A: 检查端口8000是否被占用,可修改.env中的SERVICE_PORT + +### Q: 临时文件在哪里? +A: 默认在/tmp/extraction_service目录,可通过TEMP_DIR环境变量配置 + +## License + +MIT + + + + + + diff --git a/extraction_service/install.bat b/extraction_service/install.bat new file mode 100644 index 00000000..51376170 --- /dev/null +++ b/extraction_service/install.bat @@ -0,0 +1,89 @@ +@echo off +chcp 65001 >nul +echo ================================ +echo 安装文档提取微服务依赖 +echo ================================ +echo. + +REM 检查Python +echo [1/5] 检查Python环境... +python --version >nul 2>&1 +if errorlevel 1 ( + echo ❌ 错误: 未找到Python + echo 请先安装Python 3.8或更高版本 + echo 下载地址: https://www.python.org/downloads/ + pause + exit /b 1 +) + +python --version +echo ✅ Python已安装 +echo. + +REM 创建虚拟环境 +echo [2/5] 创建虚拟环境... +if exist venv ( + echo 虚拟环境已存在,跳过创建 +) else ( + python -m venv venv + if errorlevel 1 ( + echo ❌ 创建虚拟环境失败 + pause + exit /b 1 + ) + echo ✅ 虚拟环境创建成功 +) +echo. + +REM 激活虚拟环境 +echo [3/5] 激活虚拟环境... +call venv\Scripts\activate +if errorlevel 1 ( + echo ❌ 激活虚拟环境失败 + pause + exit /b 1 +) +echo ✅ 虚拟环境已激活 +echo. + +REM 升级pip +echo [4/5] 升级pip... +python -m pip install --upgrade pip +echo. + +REM 安装依赖 +echo [5/5] 安装依赖包... +echo 这可能需要几分钟时间... +pip install -r requirements.txt +if errorlevel 1 ( + echo ❌ 依赖安装失败 + pause + exit /b 1 +) +echo. + +REM 验证安装 +echo ================================ +echo 验证安装 +echo ================================ +python -c "import fastapi; print('✅ FastAPI:', fastapi.__version__)" +python -c "import fitz; print('✅ PyMuPDF:', fitz.__version__)" +python -c "import uvicorn; print('✅ Uvicorn: OK')" +echo. + +echo ================================ +echo 🎉 安装完成! 
+echo ================================ +echo. +echo 下一步: +echo 1. 启动服务: start.bat +echo 2. 测试服务: python test_service.py +echo. + +pause + + + + + + diff --git a/extraction_service/install_nougat.bat b/extraction_service/install_nougat.bat new file mode 100644 index 00000000..c005d9b6 --- /dev/null +++ b/extraction_service/install_nougat.bat @@ -0,0 +1,88 @@ +@echo off +chcp 65001 >nul +echo ================================ +echo 安装Nougat OCR +echo ================================ +echo. + +echo ⚠️ 注意事项: +echo 1. Nougat需要Python 3.8+ +echo 2. 首次运行会下载模型文件(约350MB) +echo 3. 建议使用GPU加速(需CUDA) +echo 4. 安装可能需要5-10分钟 +echo. +pause + +REM 激活虚拟环境 +if exist venv\Scripts\activate.bat ( + echo [1/4] 激活虚拟环境... + call venv\Scripts\activate +) else ( + echo 错误: 请先运行 install.bat 创建虚拟环境 + pause + exit /b 1 +) + +REM 安装Nougat +echo. +echo [2/4] 安装Nougat OCR... +echo 这可能需要几分钟时间... +echo. + +pip install nougat-ocr==0.1.17 + +if errorlevel 1 ( + echo. + echo ❌ Nougat安装失败 + echo. + echo 可能的原因: + echo 1. 网络问题:请使用国内镜像源 + echo 2. Python版本:需要Python 3.8+ + echo 3. 依赖冲突:可能需要新的虚拟环境 + echo. + echo 替代方案: + echo - 如果只使用中文PDF,可以不安装Nougat + echo - 系统会自动降级使用PyMuPDF + echo. + pause + exit /b 1 +) + +echo. +echo [3/4] 验证安装... +python -c "import nougat; print('✅ Nougat导入成功')" + +echo. +echo [4/4] 测试Nougat命令... +nougat --version + +if errorlevel 1 ( + echo ⚠️ 命令行工具未找到,但Python模块已安装 + echo 这可能不影响使用,系统会尝试直接调用Python模块 +) else ( + echo ✅ Nougat命令行工具正常 +) + +echo. +echo ================================ +echo 🎉 Nougat安装完成! +echo ================================ +echo. +echo 说明: +echo - Nougat擅长处理英文学术PDF +echo - 能保留表格、公式等结构 +echo - 中文PDF会自动使用PyMuPDF +echo - 首次使用会下载模型(约350MB) +echo. +echo 下一步: +echo - 启动服务: start.bat +echo - 健康检查: curl http://localhost:8000/api/health +echo. 
"""
Document extraction microservice - application entry point.

Endpoints:
- PDF text extraction (method chosen by ``services.pdf_processor.extract_pdf``)
- Docx text extraction (Mammoth)
- Txt text extraction
- PDF language detection and processing-strategy preview
- Health check
"""

import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import Tuple

from dotenv import load_dotenv
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from loguru import logger

# Load .env first so every os.getenv() below sees the configured values.
load_dotenv()

# Replace loguru's default sink with a stdout sink using the service format.
logger.remove()
logger.add(
    sys.stdout,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
    level=os.getenv("LOG_LEVEL", "INFO")
)

app = FastAPI(
    title="文档提取微服务",
    description="提供PDF、Docx、Txt文档的文本提取服务",
    version="1.0.0",
)

# NOTE(review): wildcard origins are only acceptable for development;
# restrict allow_origins to concrete domains before deploying.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory that receives uploaded files while they are being processed.
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)

# Service modules are imported after configuration, as in the original layout.
from services.pdf_extractor import extract_pdf_pymupdf  # noqa: E402
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy  # noqa: E402
from services.language_detector import detect_language, detect_language_detailed  # noqa: E402
from services.nougat_extractor import check_nougat_available, get_nougat_info  # noqa: E402
from services.file_utils import detect_file_type, cleanup_temp_file  # noqa: E402
from services.docx_extractor import extract_docx_mammoth, validate_docx_file  # noqa: E402
from services.txt_extractor import extract_txt, validate_txt_file  # noqa: E402

# Values accepted by the ``method`` query parameter of the PDF endpoint.
_PDF_METHODS = ("auto", "nougat", "pymupdf")


async def _save_upload(
    file: UploadFile,
    suffix: str,
    bad_type_detail: str
) -> Tuple[Path, int]:
    """
    Validate an upload's extension and store it in a unique temp file.

    The temp file name is generated by the server (PID + random UUID + the
    validated suffix). The client-supplied filename is never used as a path
    component, so it cannot escape TEMP_DIR, and two concurrent uploads with
    the same filename cannot overwrite or delete each other's file.

    Args:
        file: The uploaded file.
        suffix: Required lower-case extension including the dot, e.g. ".pdf".
        bad_type_detail: Error detail returned when the extension is wrong.

    Returns:
        (temp_path, size_in_bytes)

    Raises:
        HTTPException: 400 when the filename is missing or has a wrong extension.
    """
    filename = file.filename or ""
    if not filename.lower().endswith(suffix):
        raise HTTPException(status_code=400, detail=bad_type_detail)

    # The suffix is kept because the extractors check the extension themselves.
    temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}{suffix}"
    content = await file.read()
    try:
        temp_path.write_bytes(content)
    except Exception:
        # Do not leave a partially written file behind.
        cleanup_temp_file(temp_path)
        raise

    return temp_path, len(content)


# ==================== API routes ====================

@app.get("/")
async def root():
    """Return basic service identification."""
    return {
        "service": "文档提取微服务",
        "version": "1.0.0",
        "status": "running"
    }


@app.get("/api/health")
async def health_check():
    """
    Health check.

    Reports whether PyMuPDF can be imported, the Nougat status returned by
    ``get_nougat_info()``, and whether the temp directory exists and is
    writable. ``status`` is "healthy" only when PyMuPDF is importable and the
    temp directory is writable; otherwise it is "degraded".
    """
    try:
        import fitz  # PyMuPDF
        pymupdf_version = fitz.__version__
        pymupdf_available = True
    except Exception as e:
        pymupdf_version = "unknown"
        pymupdf_available = False
        logger.warning(f"PyMuPDF不可用: {str(e)}")

    nougat_info = get_nougat_info()

    temp_dir_writable = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)

    return {
        "status": "healthy" if (pymupdf_available and temp_dir_writable) else "degraded",
        "checks": {
            "pymupdf": {
                "available": pymupdf_available,
                "version": pymupdf_version
            },
            "nougat": nougat_info,
            "temp_dir": {
                "path": str(TEMP_DIR),
                "writable": temp_dir_writable
            }
        },
        "timestamp": datetime.now().isoformat()
    }


@app.post("/api/extract/pdf")
async def extract_pdf_endpoint(
    file: UploadFile = File(...),
    method: str = "auto"
):
    """
    Extract text from an uploaded PDF.

    Args:
        file: The uploaded PDF file.
        method: 'auto' (default, let the service choose), 'nougat' or
            'pymupdf' (force that extractor).

    Returns:
        JSON with "success", "method", "text" and "metadata"; the endpoint
        adds "file_size" and the original "filename" to the metadata.

    Raises:
        HTTPException: 400 for a non-PDF upload or an unknown method,
            500 when extraction fails.
    """
    if method not in _PDF_METHODS:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的提取方法: {method},可选值: auto、nougat、pymupdf"
        )

    temp_path = None

    try:
        temp_path, file_size = await _save_upload(
            file, ".pdf", "文件格式错误,只支持PDF文件"
        )

        logger.info(f"开始处理PDF文件: {file.filename}, 方法={method}")
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # NOTE(review): extraction is synchronous and runs on the event loop,
        # so a long extraction blocks other requests.
        force_method = None if method == "auto" else method
        result = extract_pdf(str(temp_path), force_method=force_method)

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
            )

        # setdefault guards against an extractor result without "metadata".
        metadata = result.setdefault("metadata", {})
        metadata["file_size"] = file_size
        metadata["filename"] = file.filename

        logger.info(f"PDF提取成功: {file.filename}, "
                    f"方法={result['method']}, "
                    f"原因={result.get('reason', 'N/A')}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"PDF提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/detect-language")
async def detect_language_endpoint(file: UploadFile = File(...)):
    """
    Detect the main language of an uploaded PDF.

    Returns:
        The dictionary produced by ``detect_language_detailed`` plus the
        original "filename".

    Raises:
        HTTPException: 400 for a non-PDF upload, 500 on failure.
    """
    temp_path = None

    try:
        temp_path, _ = await _save_upload(file, ".pdf", "只支持PDF文件")

        result = detect_language_detailed(str(temp_path))
        result["filename"] = file.filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/pdf-strategy")
async def get_strategy_endpoint(file: UploadFile = File(...)):
    """
    Report which extraction strategy would be used for a PDF, without extracting.

    Returns:
        The dictionary produced by ``get_pdf_processing_strategy`` plus the
        original "filename".

    Raises:
        HTTPException: 400 for a non-PDF upload, 500 on failure.
    """
    temp_path = None

    try:
        temp_path, _ = await _save_upload(file, ".pdf", "只支持PDF文件")

        result = get_pdf_processing_strategy(str(temp_path))
        result["filename"] = file.filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"获取策略失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/extract/docx")
async def extract_docx_endpoint(file: UploadFile = File(...)):
    """
    Extract text from an uploaded .docx file with Mammoth.

    Returns:
        JSON with "success", "method" ("mammoth"), "text" and "metadata"
        (including the original "filename").

    Raises:
        HTTPException: 400 for a non-docx upload, 500 when extraction fails.
    """
    temp_path = None

    try:
        temp_path, file_size = await _save_upload(
            file, ".docx", "文件格式错误,只支持Docx文件"
        )

        logger.info(f"开始处理Docx文件: {file.filename}")
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        result = extract_docx_mammoth(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
            )

        result["method"] = "mammoth"
        result.setdefault("metadata", {})["filename"] = file.filename

        logger.info(f"Docx提取成功: {file.filename}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/extract/txt")
async def extract_txt_endpoint(file: UploadFile = File(...)):
    """
    Extract the content of an uploaded .txt file.

    Returns:
        JSON with "success", "method" ("direct"), "text", "encoding" and
        "metadata" (including the original "filename").

    Raises:
        HTTPException: 400 for a non-txt upload, 500 when extraction fails.
    """
    temp_path = None

    try:
        temp_path, file_size = await _save_upload(
            file, ".txt", "文件格式错误,只支持Txt文件"
        )

        logger.info(f"开始处理Txt文件: {file.filename}")
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        result = extract_txt(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
            )

        result["method"] = "direct"
        result.setdefault("metadata", {})["filename"] = file.filename

        logger.info(f"Txt提取成功: {file.filename}, "
                    f"编码={result['encoding']}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Txt提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    file_type: str = None
):
    """
    Generic extraction endpoint that dispatches on the file type.

    Args:
        file: The uploaded file.
        file_type: Optional explicit type ('pdf' | 'docx' | 'txt',
            case-insensitive). When omitted it is derived from the filename.

    Returns:
        The response of the type-specific endpoint.

    Raises:
        HTTPException: 400 for an unsupported or undetectable type,
            500 on unexpected failure.
    """
    try:
        if not file_type:
            try:
                file_type = detect_file_type(file.filename or "")
            except ValueError as e:
                # An unsupported extension is a client error, not a server fault.
                raise HTTPException(status_code=400, detail=str(e))

        file_type = file_type.lower()

        logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")

        if file_type == 'pdf':
            # Pass method explicitly: this is a plain call, not a routed request.
            return await extract_pdf_endpoint(file, method="auto")
        elif file_type == 'docx':
            return await extract_docx_endpoint(file)
        elif file_type == 'txt':
            return await extract_txt_endpoint(file)
        else:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件格式: {file_type},仅支持PDF、Docx、Txt"
            )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )


# ==================== Startup ====================

if __name__ == "__main__":
    import uvicorn

    port = int(os.getenv("SERVICE_PORT", 8000))
    host = os.getenv("SERVICE_HOST", "0.0.0.0")
    # NOTE(review): DEBUG defaults to "True", which enables auto-reload;
    # set DEBUG=False explicitly in production.
    debug = os.getenv("DEBUG", "True").lower() == "true"

    logger.info("启动文档提取微服务...")
    logger.info(f"地址: http://{host}:{port}")
    logger.info(f"健康检查: http://{host}:{port}/api/health")
    logger.info(f"调试模式: {debug}")

    uvicorn.run(
        "main:app",
        host=host,
        port=port,
        reload=debug,
        log_level="info"
    )
def _docx_precheck_error(file_path: str) -> str:
    """Return an error message if file_path is missing or not .docx, else ""."""
    candidate = Path(file_path)
    if not candidate.exists():
        return f"文件不存在: {file_path}"
    if candidate.suffix.lower() != '.docx':
        return f"不支持的文件格式: {candidate.suffix},仅支持.docx"
    return ""


def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
    """
    Extract the plain text of a .docx file with Mammoth.

    Args:
        file_path: Path to the .docx file.

    Returns:
        On success:
            {
                "success": True,
                "text": <extracted text>,
                "format": "plain_text",
                "metadata": {"char_count", "has_tables", "file_size", "warnings"}
            }
        On failure: {"success": False, "error": <message>, "text": "",
        "metadata": {...}}. A document whose text is empty or whitespace-only
        is reported as a failure. Exceptions are caught and reported the same
        way; nothing is raised to the caller.
    """
    try:
        problem = _docx_precheck_error(file_path)
        if problem:
            return {
                "success": False,
                "error": problem,
                "text": "",
                "metadata": {}
            }

        docx_path = Path(file_path)
        logger.info(f"开始提取Docx文件: {docx_path.name}")

        with open(docx_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)

        text = result.value
        messages = result.messages  # conversion warnings/errors reported by Mammoth

        if messages:
            logger.warning(f"Mammoth提取警告: {len(messages)}个")
            for msg in messages:
                logger.debug(f"  - {msg.type}: {msg.message}")

        file_size = docx_path.stat().st_size

        # strip() so a document that yields only blank lines is not treated
        # as a successful extraction.
        if not text.strip():
            logger.warning("提取的文本为空")
            return {
                "success": False,
                "error": "文档内容为空或无法提取",
                "text": "",
                "metadata": {
                    "char_count": 0,
                    "file_size": file_size
                }
            }

        char_count = len(text)

        # Heuristic only: a tab or pipe character anywhere in the text counts
        # as "has tables". NOTE(review): this can misfire on ordinary prose.
        has_tables = '\t' in text or '|' in text

        logger.info(f"Docx提取成功: {char_count}个字符")

        return {
            "success": True,
            "text": text,
            "format": "plain_text",
            "metadata": {
                "char_count": char_count,
                "has_tables": has_tables,
                "file_size": file_size,
                "warnings": len(messages)
            }
        }

    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "text": "",
            "metadata": {}
        }


def extract_docx_html(file_path: str) -> Dict[str, Any]:
    """
    Convert a .docx file to HTML with Mammoth.

    Applies the same existence and extension checks as
    ``extract_docx_mammoth``.

    Args:
        file_path: Path to the .docx file.

    Returns:
        On success:
            {
                "success": True,
                "html": <HTML string>,
                "format": "html",
                "metadata": {"html_length", "file_size", "warnings"}
            }
        On failure: {"success": False, "error": <message>, "html": "",
        "metadata": {}}. Exceptions are caught and reported the same way.
    """
    try:
        problem = _docx_precheck_error(file_path)
        if problem:
            return {
                "success": False,
                "error": problem,
                "html": "",
                "metadata": {}
            }

        docx_path = Path(file_path)
        logger.info(f"开始提取Docx为HTML: {docx_path.name}")

        with open(docx_path, "rb") as docx_file:
            result = mammoth.convert_to_html(docx_file)

        html = result.value
        messages = result.messages

        if messages:
            logger.warning(f"HTML转换警告: {len(messages)}个")

        logger.info(f"HTML提取成功: {len(html)}个字符")

        return {
            "success": True,
            "html": html,
            "format": "html",
            "metadata": {
                "html_length": len(html),
                "file_size": docx_path.stat().st_size,
                "warnings": len(messages)
            }
        }

    except Exception as e:
        logger.error(f"HTML提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "html": "",
            "metadata": {}
        }


def validate_docx_file(file_path: str) -> Dict[str, Any]:
    """
    Check whether a file looks like a usable .docx document.

    Checks, in order: the file exists, has a .docx extension, is non-empty
    and at most 50MB, starts with the ZIP local-header signature, opens as a
    ZIP archive, and contains the "[Content_Types].xml" part.

    Args:
        file_path: Path to the file.

    Returns:
        {"valid": True, "reason": ..., "file_info": {"filename", "size",
        "size_mb"}} when every check passes, otherwise
        {"valid": False, "reason": <why>}. Nothing is raised to the caller.
    """
    # stdlib; imported locally so the module's import block stays untouched.
    import zipfile

    try:
        docx_path = Path(file_path)

        if not docx_path.exists():
            return {
                "valid": False,
                "reason": "文件不存在"
            }

        if docx_path.suffix.lower() != '.docx':
            return {
                "valid": False,
                "reason": f"不支持的格式: {docx_path.suffix}(仅支持.docx)"
            }

        file_size = docx_path.stat().st_size
        max_size = 50 * 1024 * 1024  # 50MB

        if file_size > max_size:
            return {
                "valid": False,
                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB(限制50MB)"
            }

        if file_size == 0:
            return {
                "valid": False,
                "reason": "文件为空"
            }

        try:
            with open(docx_path, "rb") as f:
                signature = f.read(4)
            if signature != b'PK\x03\x04':
                return {
                    "valid": False,
                    "reason": "不是有效的Docx文件(ZIP签名错误)"
                }

            # Opening the archive catches truncated or corrupt files that
            # still begin with a valid signature.
            with zipfile.ZipFile(docx_path) as archive:
                member_names = set(archive.namelist())
        except zipfile.BadZipFile:
            return {
                "valid": False,
                "reason": "不是有效的Docx文件(ZIP结构损坏)"
            }
        except Exception as e:
            return {
                "valid": False,
                "reason": f"无法读取文件: {str(e)}"
            }

        # An arbitrary ZIP renamed to .docx will not carry this part.
        if "[Content_Types].xml" not in member_names:
            return {
                "valid": False,
                "reason": "不是有效的Docx文件(缺少[Content_Types].xml)"
            }

        return {
            "valid": True,
            "reason": "文件有效",
            "file_info": {
                "filename": docx_path.name,
                "size": file_size,
                "size_mb": round(file_size / 1024 / 1024, 2)
            }
        }

    except Exception as e:
        return {
            "valid": False,
            "reason": f"验证失败: {str(e)}"
        }
Exception as e: + return { + "valid": False, + "reason": f"无法读取文件: {str(e)}" + } + + return { + "valid": True, + "reason": "文件有效", + "file_info": { + "filename": file_path_obj.name, + "size": file_size, + "size_mb": round(file_size / 1024 / 1024, 2) + } + } + + except Exception as e: + return { + "valid": False, + "reason": f"验证失败: {str(e)}" + } + + + + + + diff --git a/extraction_service/services/file_utils.py b/extraction_service/services/file_utils.py new file mode 100644 index 00000000..55f51334 --- /dev/null +++ b/extraction_service/services/file_utils.py @@ -0,0 +1,88 @@ +""" +文件工具函数 +""" + +import os +from pathlib import Path +from loguru import logger + + +def detect_file_type(filename: str) -> str: + """ + 根据文件名检测文件类型 + + Args: + filename: 文件名 + + Returns: + 文件类型: 'pdf' | 'docx' | 'txt' + + Raises: + ValueError: 不支持的文件格式 + """ + ext = filename.lower().split('.')[-1] + + if ext == 'pdf': + return 'pdf' + elif ext == 'docx': + return 'docx' + elif ext == 'txt': + return 'txt' + else: + raise ValueError(f"不支持的文件格式: .{ext}") + + +def cleanup_temp_file(file_path: Path | str) -> None: + """ + 清理临时文件 + + Args: + file_path: 文件路径 + """ + try: + if isinstance(file_path, str): + file_path = Path(file_path) + + if file_path.exists(): + file_path.unlink() + logger.debug(f"清理临时文件: {file_path}") + except Exception as e: + logger.warning(f"清理临时文件失败: {str(e)}") + + +def get_file_size_mb(file_path: Path | str) -> float: + """ + 获取文件大小(MB) + + Args: + file_path: 文件路径 + + Returns: + 文件大小(MB) + """ + if isinstance(file_path, str): + file_path = Path(file_path) + + if file_path.exists(): + return file_path.stat().st_size / (1024 * 1024) + return 0.0 + + +def validate_file_size(file_size: int, max_size: int = 52428800) -> bool: + """ + 验证文件大小 + + Args: + file_size: 文件大小(字节) + max_size: 最大允许大小(字节),默认50MB + + Returns: + 是否通过验证 + """ + return file_size <= max_size + + + + + + diff --git a/extraction_service/services/language_detector.py 
# Number of leading pages sampled for language statistics.
_SAMPLE_PAGE_LIMIT = 3
# Below this many characters of sample text, detection falls back to English.
_MIN_SAMPLE_CHARS = 100


def _sample_pdf_text(pdf_path: str) -> tuple:
    """Return (text of the first pages, number of pages sampled)."""
    with pdfplumber.open(pdf_path) as pdf:
        sample_pages = min(_SAMPLE_PAGE_LIMIT, len(pdf.pages))
        chunks = []

        for i in range(sample_pages):
            try:
                page_text = pdf.pages[i].extract_text()
            except Exception as e:
                # One unreadable page must not abort detection.
                logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
                continue
            if page_text:
                chunks.append(page_text + "\n")

    return "".join(chunks), sample_pages


def _count_chinese(sample_text: str) -> tuple:
    """Return (chinese_chars, total_non_whitespace_chars) for sample_text."""
    chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
    total_chars = sum(1 for c in sample_text if c.strip())
    return chinese_chars, total_chars


def _classify_pdf(pdf_path: str, threshold: float) -> str:
    """Return 'chinese' if the sampled Chinese ratio exceeds threshold, else 'english'."""
    try:
        logger.info(f"开始语言检测: {pdf_path}")

        sample_text, _ = _sample_pdf_text(pdf_path)

        if len(sample_text.strip()) < _MIN_SAMPLE_CHARS:
            logger.warning("文本太少,默认使用英文处理")
            return 'english'

        chinese_chars, total_chars = _count_chinese(sample_text)

        if total_chars == 0:
            logger.warning("无有效字符,默认使用英文处理")
            return 'english'

        chinese_ratio = chinese_chars / total_chars

        logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")

        language = 'chinese' if chinese_ratio > threshold else 'english'

        logger.info(f"检测结果: {language}")
        return language

    except Exception as e:
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'


def detect_language(pdf_path: str) -> str:
    """
    Detect the main language of a PDF.

    Samples the text of the first 3 pages (or all pages if fewer) and
    computes the share of CJK Unified Ideographs (U+4E00-U+9FFF) among
    non-whitespace characters.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'chinese' when the ratio is above 30% (this includes mixed documents
        that are mostly Chinese), otherwise 'english'. Also returns 'english'
        when fewer than 100 characters could be sampled or when detection
        fails. This function never returns 'mixed'; use
        ``detect_language_detailed`` for that distinction.
    """
    return _classify_pdf(pdf_path, 0.3)


def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect the language of a PDF and return the underlying statistics.

    Uses the same sampling as ``detect_language`` but does not apply the
    minimum-text fallback.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        {
            "language": "chinese" (ratio > 0.3) | "mixed" (ratio > 0.1) | "english",
            "chinese_ratio": ratio rounded to 4 decimals,
            "chinese_chars": int,
            "total_chars": int (non-whitespace characters),
            "sample_pages": int,
            "sample_text_length": int
        }
        On failure: {"language": "english", "error": <message>}.
    """
    try:
        sample_text, sample_pages = _sample_pdf_text(pdf_path)

        chinese_chars, total_chars = _count_chinese(sample_text)
        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0

        if chinese_ratio > 0.3:
            language = 'chinese'
        elif chinese_ratio > 0.1:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": chinese_chars,
            "total_chars": total_chars,
            "sample_pages": sample_pages,
            "sample_text_length": len(sample_text)
        }

    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }


def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Tell whether a PDF is predominantly Chinese.

    Args:
        pdf_path: Path to the PDF file.
        threshold: Minimum Chinese-character ratio (exclusive), default 30%.

    Returns:
        True if the sampled Chinese-character ratio is greater than
        ``threshold``. False when too little text could be sampled or when
        detection fails. With the default threshold this equals
        ``detect_language(pdf_path) == 'chinese'``.
    """
    return _classify_pdf(pdf_path, threshold) == 'chinese'
module not found") + return False + except Exception as e: + logger.error(f"检查Nougat失败: {str(e)}") + return False + + +def extract_pdf_nougat( + file_path: str, + output_dir: Optional[str] = None, + progress_callback: Optional[Callable[[int, int], None]] = None +) -> Dict[str, Any]: + """ + 使用Nougat提取PDF文本 + + Args: + file_path: PDF文件路径 + output_dir: 输出目录,默认为临时目录 + progress_callback: 进度回调函数 (current_page, total_pages) + + Returns: + { + "success": True, + "method": "nougat", + "text": "提取的Markdown文本", + "format": "markdown", + "metadata": { + "page_count": 20, + "char_count": 50000, + "quality_score": 0.95, + "has_tables": True, + "has_formulas": True + } + } + """ + try: + # 检查Nougat是否可用 + if not check_nougat_available(): + raise Exception("Nougat未安装,请先安装:pip install nougat-ocr") + + logger.info(f"开始使用Nougat提取: {file_path}") + + # 准备输出目录 + if output_dir is None: + output_dir = os.path.join(os.path.dirname(file_path), "nougat_output") + + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # 构建Nougat命令 + # nougat命令格式:nougat -o + cmd = [ + 'nougat', + file_path, + '-o', output_dir, + '--markdown', # 输出Markdown格式 + '--no-skipping' # 不跳过任何页面 + ] + + logger.info(f"执行命令: {' '.join(cmd)}") + + # 执行Nougat + # 注意:Nougat可能需要较长时间(1-2分钟/20页) + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + # 等待完成 + stdout, stderr = process.communicate(timeout=300) # 5分钟超时 + + if process.returncode != 0: + logger.error(f"Nougat执行失败: {stderr}") + raise Exception(f"Nougat执行失败: {stderr}") + + # 读取输出文件 + # Nougat会生成 .mmd 文件 + pdf_name = Path(file_path).stem + output_file = Path(output_dir) / f"{pdf_name}.mmd" + + if not output_file.exists(): + raise Exception(f"Nougat输出文件不存在: {output_file}") + + with open(output_file, 'r', encoding='utf-8') as f: + markdown_text = f.read() + + # 评估质量 + quality_result = evaluate_nougat_quality(markdown_text) + + logger.info(f"Nougat提取完成: 质量={quality_result['quality_score']:.2f}") + + return { + 
"success": True, + "method": "nougat", + "text": markdown_text, + "format": "markdown", + "metadata": { + "char_count": len(markdown_text), + "quality_score": quality_result['quality_score'], + "has_tables": quality_result['has_tables'], + "has_formulas": quality_result['has_formulas'], + "has_structure": quality_result['has_structure'] + } + } + + except subprocess.TimeoutExpired: + logger.error("Nougat处理超时(>5分钟)") + return { + "success": False, + "error": "处理超时", + "method": "nougat" + } + + except Exception as e: + logger.error(f"Nougat提取失败: {str(e)}") + return { + "success": False, + "error": str(e), + "method": "nougat" + } + + +def evaluate_nougat_quality(text: str) -> Dict[str, Any]: + """ + 评估Nougat提取质量 + + 评分标准: + - 基础分:0.5 + - 有章节结构:+0.2 + - 有表格:+0.15 + - 有公式:+0.15 + - 文本长度充足:+0.1 + - 乱码检测:-0.3 + + Args: + text: Nougat提取的Markdown文本 + + Returns: + { + "quality_score": 0.92, + "has_structure": True, + "has_tables": True, + "has_formulas": True, + "has_garbled": False + } + """ + score = 0.5 # 基础分 + + # 检查章节结构(Markdown标题) + has_structure = bool(text.count('##') >= 2 or text.count('#') >= 3) + if has_structure: + score += 0.2 + + # 检查表格 + has_tables = '|' in text and '---' in text + if has_tables: + score += 0.15 + + # 检查公式(LaTeX格式) + has_formulas = '$$' in text or '$' in text or '\\(' in text + if has_formulas: + score += 0.15 + + # 检查文本长度 + if len(text) > 5000: # 至少5000字符 + score += 0.1 + + # 检查乱码(简单启发式) + # 大量重复字符或特殊符号可能表示乱码 + garbled_chars = sum(1 for c in text if ord(c) > 65535 or c in '��') + has_garbled = garbled_chars > len(text) * 0.05 # 超过5% + if has_garbled: + score -= 0.3 + + # 确保分数在0-1之间 + score = max(0.0, min(1.0, score)) + + return { + "quality_score": score, + "has_structure": has_structure, + "has_tables": has_tables, + "has_formulas": has_formulas, + "has_garbled": has_garbled + } + + +def get_nougat_info() -> Dict[str, Any]: + """ + 获取Nougat信息 + + Returns: + Nougat版本和状态信息 + """ + try: + import nougat + version = getattr(nougat, 
'__version__', 'unknown') + return { + "available": True, + "version": version + } + + except ImportError: + return { + "available": False, + "error": "Nougat未安装" + } + + except Exception as e: + return { + "available": False, + "error": str(e) + } + diff --git a/extraction_service/services/pdf_extractor.py b/extraction_service/services/pdf_extractor.py new file mode 100644 index 00000000..5c1d823c --- /dev/null +++ b/extraction_service/services/pdf_extractor.py @@ -0,0 +1,191 @@ +""" +PDF文本提取服务 + +使用PyMuPDF (fitz)提取PDF文本内容 +""" + +import fitz # PyMuPDF +from typing import Dict, Any +from loguru import logger + + +def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]: + """ + 使用PyMuPDF提取PDF文本 + + Args: + file_path: PDF文件路径 + + Returns: + { + "success": True, + "method": "pymupdf", + "text": "提取的文本", + "metadata": { + "page_count": 20, + "char_count": 50000, + "has_text": True + } + } + """ + try: + logger.info(f"开始使用PyMuPDF提取: {file_path}") + + # 打开PDF + doc = fitz.open(file_path) + page_count = len(doc) + + logger.info(f"PDF页数: {page_count}") + + # 提取所有页面的文本 + text_parts = [] + + for page_num in range(page_count): + try: + page = doc[page_num] + text = page.get_text() + + if text.strip(): + # 添加页面分隔符 + text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n") + text_parts.append(text) + + logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符") + + except Exception as e: + logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}") + continue + + # 合并文本 + full_text = "".join(text_parts) + char_count = len(full_text) + + # 关闭文档 + doc.close() + + # 检查是否提取到文本 + has_text = char_count > 100 # 至少要有100个字符 + + if not has_text: + logger.warning(f"PDF可能是扫描版或无文本内容") + + logger.info(f"PyMuPDF提取完成: 字符数={char_count}") + + return { + "success": True, + "method": "pymupdf", + "text": full_text, + "format": "plain_text", + "metadata": { + "page_count": page_count, + "char_count": char_count, + "has_text": has_text + } + } + + except Exception as e: + logger.error(f"PyMuPDF提取失败: 
{str(e)}") + return { + "success": False, + "error": str(e), + "method": "pymupdf" + } + + +def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]: + """ + 使用PyMuPDF提取PDF文本(保留布局) + + Args: + file_path: PDF文件路径 + + Returns: + 提取结果 + """ + try: + logger.info(f"开始使用PyMuPDF提取(保留布局): {file_path}") + + doc = fitz.open(file_path) + page_count = len(doc) + + text_parts = [] + + for page_num in range(page_count): + try: + page = doc[page_num] + + # 使用dict模式提取,可以保留更多格式信息 + blocks = page.get_text("dict")["blocks"] + + page_text = [] + + for block in blocks: + if block["type"] == 0: # 文本块 + for line in block.get("lines", []): + for span in line.get("spans", []): + text = span.get("text", "") + if text.strip(): + page_text.append(text) + + if page_text: + text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n") + text_parts.append(" ".join(page_text)) + + except Exception as e: + logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}") + continue + + full_text = "".join(text_parts) + doc.close() + + return { + "success": True, + "method": "pymupdf_layout", + "text": full_text, + "format": "plain_text", + "metadata": { + "page_count": page_count, + "char_count": len(full_text) + } + } + + except Exception as e: + logger.error(f"PyMuPDF布局提取失败: {str(e)}") + return { + "success": False, + "error": str(e) + } + + +def get_pdf_metadata(file_path: str) -> Dict[str, Any]: + """ + 获取PDF元数据 + + Args: + file_path: PDF文件路径 + + Returns: + PDF元数据 + """ + try: + doc = fitz.open(file_path) + + metadata = { + "page_count": len(doc), + "metadata": doc.metadata, + "is_encrypted": doc.is_encrypted, + "is_pdf": doc.is_pdf + } + + doc.close() + return metadata + + except Exception as e: + logger.error(f"获取PDF元数据失败: {str(e)}") + return {} + + + + + + diff --git a/extraction_service/services/pdf_processor.py b/extraction_service/services/pdf_processor.py new file mode 100644 index 00000000..9754c99f --- /dev/null +++ b/extraction_service/services/pdf_processor.py @@ -0,0 +1,192 @@ +""" +PDF处理主服务 + 
+Implements a sequential fallback strategy:
+1. Detect the language.
+2. Chinese PDF → PyMuPDF (fast).
+3. English PDF → Nougat → fall back to PyMuPDF on failure.
+"""
+
+from typing import Dict, Any, Optional
+from loguru import logger
+
+from .language_detector import detect_language
+from .nougat_extractor import extract_pdf_nougat, check_nougat_available
+from .pdf_extractor import extract_pdf_pymupdf
+
+
+def extract_pdf(
+    file_path: str,
+    force_method: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Main PDF extraction entry point (sequential fallback strategy).
+
+    Flow:
+    1. If force_method is 'nougat' or 'pymupdf', use that extractor directly.
+    2. Otherwise detect the language.
+    3. Chinese → PyMuPDF directly.
+    4. English → try Nougat; fall back to PyMuPDF when Nougat is
+       unavailable, fails, or scores below 0.7.
+
+    Args:
+        file_path: Path to the PDF file.
+        force_method: Extractor to force ('nougat' | 'pymupdf').
+
+    Returns:
+        The extractor's result dict, annotated with "reason"
+        ('force_nougat' | 'force_pymupdf' | 'chinese_pdf' |
+        'english_pdf_high_quality' | 'nougat_unavailable' |
+        'nougat_failed_or_low_quality') and, on the automatic path,
+        "detected_language". Unexpected errors yield
+        {"success": False, "error": "...", "method": "unknown"}.
+    """
+    try:
+        logger.info(f"开始处理PDF: {file_path}")
+
+        # A forced method makes language detection unnecessary, so handle
+        # it before opening the PDF for sampling.
+        if force_method:
+            logger.info(f"强制使用方法: {force_method}")
+
+            if force_method == 'nougat':
+                result = extract_pdf_nougat(file_path)
+                result['reason'] = 'force_nougat'
+                return result
+            elif force_method == 'pymupdf':
+                result = extract_pdf_pymupdf(file_path)
+                result['reason'] = 'force_pymupdf'
+                return result
+
+            logger.warning(f"未知的强制方法: {force_method},使用自动策略")
+
+        # Step 1: language detection
+        logger.info("[Step 1] 检测PDF语言...")
+        language = detect_language(file_path)
+        logger.info(f"检测结果: {language}")
+
+        # Step 2: Chinese PDF → PyMuPDF directly
+        if language == 'chinese':
+            logger.info("[Step 2] 中文PDF,使用PyMuPDF快速处理")
+
+            result = extract_pdf_pymupdf(file_path)
+
+            if result['success']:
+                result['reason'] = 'chinese_pdf'
+                result['detected_language'] = language
+                logger.info("✅ PyMuPDF处理成功(中文PDF)")
+            else:
+                logger.error("❌ PyMuPDF处理失败")
+            return result
+
+        # Step 3: English PDF → try Nougat
+        logger.info("[Step 3] 英文PDF,尝试Nougat高质量解析")
+
+        if not check_nougat_available():
+            logger.warning("⚠️ Nougat不可用,降级到PyMuPDF")
+
+            result = extract_pdf_pymupdf(file_path)
+            if result['success']:
+                result['reason'] = 'nougat_unavailable'
+                result['detected_language'] = language
+            return result
+
+        # Failures inside this block are raised on purpose so that every
+        # Nougat problem funnels into the single PyMuPDF fallback below.
+        try:
+            nougat_result = extract_pdf_nougat(file_path)
+
+            if not nougat_result['success']:
+                logger.warning("⚠️ Nougat提取失败,降级到PyMuPDF")
+                raise RuntimeError(nougat_result.get('error', 'Nougat failed'))
+
+            quality_score = nougat_result['metadata'].get('quality_score', 0)
+
+            logger.info(f"Nougat质量评分: {quality_score:.2f}")
+
+            # Quality threshold: 0.7
+            if quality_score >= 0.7:
+                logger.info("✅ Nougat处理成功(质量合格)")
+                nougat_result['reason'] = 'english_pdf_high_quality'
+                nougat_result['detected_language'] = language
+                return nougat_result
+            else:
+                logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f},降级到PyMuPDF")
+                raise RuntimeError(f"Quality too low: {quality_score}")
+
+        except Exception as e:
+            logger.warning(f"Nougat处理失败: {str(e)},降级到PyMuPDF")
+
+            # Step 4: fall back to PyMuPDF
+            logger.info("[Step 4] 降级使用PyMuPDF")
+
+            result = extract_pdf_pymupdf(file_path)
+
+            if result['success']:
+                result['reason'] = 'nougat_failed_or_low_quality'
+                result['detected_language'] = language
+                result['fallback'] = True
+                logger.info("✅ PyMuPDF处理成功(降级方案)")
+            else:
+                logger.error("❌ PyMuPDF处理也失败了")
+
+            return result
+
+    except Exception as e:
+        logger.error(f"PDF处理完全失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "unknown"
+        }
+
+
+def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
+    """
+    Report which extraction method would be used, without extracting.
+
+    Mirrors the automatic decision made by extract_pdf.
+
+    Args:
+        file_path: Path to the PDF file.
+
+    Returns:
+        {
+            "detected_language": "chinese" | "english",
+            "recommended_method": "nougat" | "pymupdf",
+            "reason": "...",
+            "nougat_available": True | False
+        }
+        or {"error": "..."} if the strategy cannot be determined.
+    """
+    try:
+        language = detect_language(file_path)
+        nougat_available = check_nougat_available()
+
+        if language == 'chinese':
+            recommended_method = 'pymupdf'
+            reason = '中文PDF,推荐使用PyMuPDF快速处理'
+        elif nougat_available:
+            recommended_method = 'nougat'
+            reason = '英文PDF,推荐使用Nougat高质量解析'
+        else:
+            recommended_method = 'pymupdf'
+            reason = 'Nougat不可用,使用PyMuPDF'
+
+        return {
+            "detected_language": language,
+            "recommended_method": recommended_method,
+            "reason": reason,
+            "nougat_available": nougat_available
+        }
+
+    except Exception as e:
+        logger.error(f"获取处理策略失败: {str(e)}")
+        return {
+            "error": str(e)
+        }
+
+
+
+
+
diff --git a/extraction_service/services/txt_extractor.py b/extraction_service/services/txt_extractor.py
new file mode 100644
index 00000000..b4e860e3
--- /dev/null
+++ b/extraction_service/services/txt_extractor.py
@@ -0,0 +1,337 @@
+"""
+Txt file extraction service.
+
+Reads plain-text files directly, with automatic encoding detection and a
+fallback chain of common encodings.
+"""
+
+import codecs
+from pathlib import Path
+from typing import Dict, Any, List, Optional
+from loguru import logger
+import chardet
+
+
+def extract_txt(file_path: str) -> Dict[str, Any]:
+    """
+    Extract the content of a .txt file.
+
+    Features:
+    - Automatic encoding detection (UTF-8, GBK, GB2312, ...) with fallback.
+    - A leading BOM is removed from the returned text.
+
+    The whole file is read into memory in one call.
+
+    Args:
+        file_path: Path to the .txt file.
+
+    Returns:
+        {
+            "success": True,
+            "text": "file content",
+            "encoding": "encoding actually used",
+            "metadata": {
+                "char_count": number of characters,
+                "line_count": number of newlines + 1,
+                "file_size": size in bytes,
+                "size_kb": size in KB rounded to 2 decimals
+            }
+        }
+        On failure: {"success": False, "error": "...", "text": "",
+        "metadata": {...}}
+    """
+    try:
+        file_path_obj = Path(file_path)
+
+        if not file_path_obj.exists():
+            return {
+                "success": False,
+                "error": f"文件不存在: {file_path}",
+                "text": "",
+                "metadata": {}
+            }
+
+        if file_path_obj.suffix.lower() != '.txt':
+            return {
+                "success": False,
+                "error": f"不支持的文件格式: {file_path_obj.suffix},仅支持.txt",
+                "text": "",
+                "metadata": {}
+            }
+
+        file_size = file_path_obj.stat().st_size
+
+        if file_size == 0:
+            return {
+                "success": False,
+                "error": "文件为空",
+                "text": "",
+                "metadata": {
+                    "char_count": 0,
+                    "line_count": 0,
+                    "file_size": 0
+                }
+            }
+
+        logger.info(f"开始提取Txt文件: {file_path_obj.name} ({file_size / 1024:.2f} KB)")
+
+        detected_encoding = detect_encoding(file_path)
+        logger.info(f"检测到编码: {detected_encoding}")
+
+        # Read with the detected encoding first, then the fallback chain.
+        text, actual_encoding = read_with_fallback(file_path, detected_encoding)
+
+        if text is None:
+            return {
+                "success": False,
+                "error": "无法解码文件,尝试了多种编码均失败",
+                "text": "",
+                "metadata": {}
+            }
+
+        char_count = len(text)
+        line_count = text.count('\n') + 1
+
+        logger.info(f"Txt提取成功: {char_count}个字符, {line_count}行")
+
+        return {
+            "success": True,
+            "text": text,
+            "encoding": actual_encoding,
+            "metadata": {
+                "char_count": char_count,
+                "line_count": line_count,
+                "file_size": file_size,
+                "size_kb": round(file_size / 1024, 2)
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"Txt提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "text": "",
+            "metadata": {}
+        }
+
+
+def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
+    """
+    Detect the encoding of a file with chardet.
+
+    Falls back to 'utf-8' when detection fails, finds nothing, or reports
+    a confidence below 0.7.
+
+    Args:
+        file_path: Path to the file.
+        sample_size: Number of leading bytes to sample.
+
+    Returns:
+        Name of the detected encoding.
+    """
+    try:
+        with open(file_path, 'rb') as f:
+            raw_data = f.read(sample_size)
+
+        result = chardet.detect(raw_data)
+        encoding = result['encoding']
+        # Guard against a missing confidence so the format specs below
+        # cannot raise.
+        confidence = result['confidence'] or 0.0
+
+        logger.debug(f"编码检测: {encoding} (置信度: {confidence:.2f})")
+
+        if confidence < 0.7:
+            logger.warning(f"编码置信度较低({confidence:.2f}),将尝试UTF-8")
+            return 'utf-8'
+
+        return encoding if encoding else 'utf-8'
+
+    except Exception as e:
+        logger.warning(f"编码检测失败: {str(e)},使用UTF-8")
+        return 'utf-8'
+
+
+def read_with_fallback(
+    file_path: str,
+    primary_encoding: str
+) -> tuple[Optional[str], Optional[str]]:
+    """
+    Read a file, trying several encodings in priority order.
+
+    Candidates are de-duplicated by canonical codec name (so aliases such
+    as 'iso-8859-1' and 'latin-1' are tried once) and names Python does
+    not know are skipped. 'latin-1' accepts every byte sequence, so it is
+    kept after 'cp1252' as the catch-all; anything listed after it would
+    be unreachable.
+
+    Args:
+        file_path: Path to the file.
+        primary_encoding: Encoding to try first.
+
+    Returns:
+        (text, encoding_used), with a leading BOM stripped from text, or
+        (None, None) if the file could not be read with any candidate.
+    """
+    encodings = [
+        primary_encoding,
+        'utf-8',
+        'utf-8-sig',  # UTF-8 with BOM
+        'gbk',
+        'gb2312',
+        'gb18030',
+        'cp1252',
+        'latin-1',
+        'iso-8859-1'
+    ]
+
+    # De-duplicate by canonical codec name while keeping priority order.
+    seen = set()
+    unique_encodings = []
+    for enc in encodings:
+        if not enc:
+            continue
+        try:
+            canonical = codecs.lookup(enc).name
+        except LookupError:
+            logger.debug(f"未知编码 {enc},跳过")
+            continue
+        if canonical not in seen:
+            seen.add(canonical)
+            unique_encodings.append(enc)
+
+    for encoding in unique_encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
+                text = f.read()
+        except UnicodeDecodeError:
+            logger.debug(f"编码 {encoding} 解码失败,尝试下一个")
+            continue
+        except Exception as e:
+            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
+            continue
+
+        # Plain 'utf-8' leaves a BOM in place as U+FEFF; drop it.
+        if text.startswith('\ufeff'):
+            text = text[1:]
+
+        logger.info(f"成功使用编码: {encoding}")
+        return text, encoding
+
+    logger.error("所有编码尝试均失败")
+    return None, None
+
+
+def validate_txt_file(file_path: str) -> Dict[str, Any]:
+    """
+    Validate that a path points to a usable .txt file.
+
+    Checks existence, the .txt suffix, and that the size is non-zero and at
+    most 10MB, then detects the encoding.
+
+    Args:
+        file_path: Path to the file.
+
+    Returns:
+        {
+            "valid": True/False,
+            "reason": "...",
+            "file_info": {...}  (only present when valid)
+        }
+    """
+    try:
+        file_path_obj = Path(file_path)
+
+        if not file_path_obj.exists():
+            return {
+                "valid": False,
+                "reason": "文件不存在"
+            }
+
+        if file_path_obj.suffix.lower() != '.txt':
+            return {
+                "valid": False,
+                "reason": f"不支持的格式: {file_path_obj.suffix}(仅支持.txt)"
+            }
+
+        # Size limit: 10MB.
+        file_size = file_path_obj.stat().st_size
+        max_size = 10 * 1024 * 1024
+
+        if file_size > max_size:
+            return {
+                "valid": False,
+                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB(限制10MB)"
+            }
+
+        if file_size == 0:
+            return {
+                "valid": False,
+                "reason": "文件为空"
+            }
+
+        encoding = detect_encoding(str(file_path_obj))
+
+        return {
+            "valid": True,
+            "reason": "文件有效",
+            "file_info": {
+                "filename": file_path_obj.name,
+                "size": file_size,
+                "size_kb": round(file_size / 1024, 2),
+                "detected_encoding": encoding
+            }
+        }
+
+    except Exception as e:
+        return {
+            "valid": False,
+            "reason": f"验证失败: {str(e)}"
+        }
+
+
+def preview_txt(file_path: str, lines: int = 10) -> Dict[str, Any]:
+    """
+    Preview the first lines of a .txt file.
+
+    The whole file is extracted first, so total_lines is exact.
+
+    Args:
+        file_path: Path to the file.
+        lines: Number of lines to include in the preview.
+
+    Returns:
+        On success: {"success": True, "preview": "...", "total_lines": N,
+        "preview_lines": M}. On failure the extract_txt error result with
+        an added empty "preview".
+    """
+    try:
+        result = extract_txt(file_path)
+
+        if not result['success']:
+            # Keep the "preview" key present on every return path.
+            return {**result, "preview": ""}
+
+        text_lines = result['text'].split('\n')
+        preview_lines = text_lines[:lines]
+
+        return {
+            "success": True,
+            "preview": '\n'.join(preview_lines),
+            "total_lines": len(text_lines),
+            "preview_lines": len(preview_lines)
+        }
+
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "preview": ""
+        }
+
+
+
+
+
+
diff --git a/extraction_service/start.bat b/extraction_service/start.bat
new file mode 100644
index 00000000..4a1781ef
--- /dev/null
+++ b/extraction_service/start.bat
@@ -0,0 +1,40 @@
+@echo off
+chcp 65001 >nul
+REM Run from the script's own directory so venv, requirements.txt and
+REM main.py resolve no matter where the script is launched from.
+cd /d "%~dp0"
+echo ================================
+echo 启动文档提取微服务
+echo ================================
+echo.
+
+REM Activate the virtual environment if there is one
+if exist venv\Scripts\activate.bat (
+    echo [1/3] 激活虚拟环境...
+    call venv\Scripts\activate
+) else (
+    echo 警告: 未找到虚拟环境,使用全局Python
+)
+
+REM Install dependencies when fastapi is not present
+echo [2/3] 检查依赖...
+pip list | findstr "fastapi" >nul
+if errorlevel 1 (
+    echo 依赖未安装,正在安装...
+    pip install -r requirements.txt
+)
+
+REM Start the service
+echo [3/3] 启动服务...
+echo.
+echo 服务地址: http://localhost:8000
+echo 健康检查: http://localhost:8000/api/health
+echo API文档: http://localhost:8000/docs
+echo.
+echo 按 Ctrl+C 停止服务
+echo.
+
+uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+
+pause
+
diff --git a/extraction_service/test_files/test.txt b/extraction_service/test_files/test.txt
new file mode 100644
index 00000000..2dd4c91c
--- /dev/null
+++ b/extraction_service/test_files/test.txt
@@ -0,0 +1,29 @@
+这是一个测试文本文件。
+用于测试Txt文件提取功能。
+
+AI临床研究平台 - Phase 2 Day 3测试
+
+功能特点:
+1. 自动编码检测
+2. 支持UTF-8、GBK等多种编码
+3. 统计字符数和行数
+4. 快速文本提取
+
+测试内容包含:
+- 中文字符
+- 英文字符 (English characters)
+- 数字 123456
+- 特殊符号 !@#$%^&*()
+
+多行文本测试:
+第一行
+第二行
+第三行
+
+结束。
+
+
+
+
+
+
diff --git a/extraction_service/test_service.py b/extraction_service/test_service.py
new file mode 100644
index 00000000..ba67860c
--- /dev/null
+++ b/extraction_service/test_service.py
@@ -0,0 +1,190 @@
+"""
+Service test script.
+
+Exercises the endpoints of the document extraction microservice.
+"""
+
+import requests
+import sys
+from pathlib import Path
+from typing import Optional
+
+
+BASE_URL = "http://localhost:8000"
+# Seconds to wait for quick endpoints / for a PDF extraction to finish.
+REQUEST_TIMEOUT = 10
+EXTRACT_TIMEOUT = 600
+
+
+def test_health():
+    """Check the health endpoint and print the reported component status."""
+    print("\n" + "="*50)
+    print("测试1: 健康检查")
+    print("="*50)
+
+    try:
+        response = requests.get(f"{BASE_URL}/api/health", timeout=REQUEST_TIMEOUT)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务状态: {data['status']}")
+            print(f"PyMuPDF: {data['checks']['pymupdf']['available']} (v{data['checks']['pymupdf']['version']})")
+            print(f"临时目录: {data['checks']['temp_dir']['path']}")
+            print("✅ 健康检查通过")
+            return True
+        else:
+            print("❌ 健康检查失败")
+            return False
+    except Exception as e:
+        print(f"❌ 连接失败: {str(e)}")
+        print("提示: 请确保服务已启动(python main.py)")
+        return False
+
+
+def test_pdf_extraction(pdf_file: Optional[str] = None):
+    """
+    Upload a PDF to the extraction endpoint and print a preview.
+
+    Returns:
+        True on success, False on failure, None when no file was given.
+    """
+    print("\n" + "="*50)
+    print("测试2: PDF文本提取")
+    print("="*50)
+
+    if not pdf_file:
+        print("跳过: 未提供测试PDF文件")
+        print("使用方法: python test_service.py [PDF文件路径]")
+        return None
+
+    pdf_path = Path(pdf_file)
+
+    if not pdf_path.exists():
+        print(f"❌ 文件不存在: {pdf_file}")
+        return False
+
+    try:
+        print(f"上传文件: {pdf_path.name}")
+        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")
+
+        with open(pdf_path, 'rb') as f:
+            files = {'file': (pdf_path.name, f, 'application/pdf')}
+            response = requests.post(
+                f"{BASE_URL}/api/extract/pdf",
+                files=files,
+                timeout=EXTRACT_TIMEOUT
+            )
+
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+
+            print("\n提取结果:")
+            print(f"方法: {data['method']}")
+            # NOTE(review): assumes the endpoint passes extractor metadata
+            # through; the Nougat result carries no page_count.
+            print(f"页数: {data['metadata'].get('page_count', 'N/A')}")
+            print(f"字符数: {data['metadata']['char_count']}")
+            print(f"文本长度: {len(data['text'])} 字符")
+
+            # Show the first 500 characters.
+            print("\n文本预览:")
+            print("-" * 50)
+            print(data['text'][:500])
+            if len(data['text']) > 500:
+                print("...")
+            print("-" * 50)
+
+            print("\n✅ PDF提取成功")
+            return True
+        else:
+            print(f"❌ 提取失败: {response.text}")
+            return False
+
+    except Exception as e:
+        print(f"❌ 请求失败: {str(e)}")
+        return False
+
+
+def test_root():
+    """Check that the root endpoint reports the service name and version."""
+    print("\n" + "="*50)
+    print("测试0: 根路径")
+    print("="*50)
+
+    try:
+        response = requests.get(f"{BASE_URL}/", timeout=REQUEST_TIMEOUT)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务: {data['service']}")
+            print(f"版本: {data['version']}")
+            print("✅ 根路径正常")
+            return True
+        else:
+            print("❌ 根路径异常")
+            return False
+    except Exception as e:
+        print(f"❌ 连接失败: {str(e)}")
+        return False
+
+
+def main():
+    """
+    Run all checks and print a summary.
+
+    Returns:
+        True if every executed (non-skipped) check passed.
+    """
+    print("\n" + "="*50)
+    print("文档提取微服务 - 测试套件")
+    print("="*50)
+
+    # Optional PDF path from the command line.
+    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None
+
+    results = [
+        ("根路径", test_root()),
+        ("健康检查", test_health()),
+        ("PDF提取", test_pdf_extraction(pdf_file)),
+    ]
+
+    print("\n" + "="*50)
+    print("测试总结")
+    print("="*50)
+
+    for name, result in results:
+        if result is True:
+            status = "✅ 通过"
+        elif result is False:
+            status = "❌ 失败"
+        else:
+            status = "⏭️ 跳过"
+        print(f"{name}: {status}")
+
+    passed = sum(1 for _, r in results if r is True)
+    total = sum(1 for _, r in results if r is not None)
+
+    print(f"\n通过率: {passed}/{total}")
+
+    all_passed = passed == total
+    if all_passed:
+        print("\n🎉 所有测试通过!")
+    else:
+        print("\n⚠️ 部分测试失败")
+
+    return all_passed
+
+
+if __name__ == "__main__":
+    # A non-zero exit status lets scripts and CI detect a failed run.
+    sys.exit(0 if main() else 1)
+
+
+
+
+
+