diff --git a/.gitignore b/.gitignore
index 28bb1399..da1252d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,78 @@ tmp/
temp/
*.tmp
+# ==================== Python ====================
+# Virtual environments (重要!避免提交 2+ GB 的依赖)
+venv/
+env/
+.venv/
+ENV/
+env.bak/
+venv.bak/
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+# PyInstaller
+*.manifest
+*.spec
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# Celery
+celerybeat-schedule
+celerybeat.pid
+
+# Environments
+.env
+.venv
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Unit test / coverage
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre
+.pyre/
+
+# pytype
+.pytype/
+
+# Cython
+cython_debug/
diff --git a/extraction_service/.gitignore b/extraction_service/.gitignore
new file mode 100644
index 00000000..45804457
--- /dev/null
+++ b/extraction_service/.gitignore
@@ -0,0 +1,40 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+build/
+dist/
+*.egg-info/
+
+# 环境变量
+.env
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# 临时文件
+/tmp/
+*.log
+
+# 测试
+.pytest_cache/
+.coverage
+htmlcov/
+
+# OS
+.DS_Store
+Thumbs.db
+
+
+
+
+
+
diff --git a/extraction_service/README.md b/extraction_service/README.md
new file mode 100644
index 00000000..0f8a5753
--- /dev/null
+++ b/extraction_service/README.md
@@ -0,0 +1,181 @@
+# 文档提取微服务
+
+基于FastAPI的文档文本提取服务,支持PDF、Docx、Txt格式。
+
+## 功能特性
+
+- ✅ **PDF提取**:使用PyMuPDF快速提取PDF文本
+- ⏳ **Docx提取**:使用Mammoth提取Word文档(Day 3)
+- ⏳ **Txt提取**:支持多种编码(Day 3)
+- ⏳ **语言检测**:自动检测PDF语言(Day 2)
+- ⏳ **Nougat集成**:高质量学术PDF解析(Day 2)
+
+## 快速开始
+
+### 1. 安装依赖
+
+```bash
+cd extraction_service
+
+# 创建虚拟环境(推荐)
+python -m venv venv
+source venv/bin/activate # Windows: venv\Scripts\activate
+
+# 安装依赖
+pip install -r requirements.txt
+```
+
+### 2. 配置环境变量
+
+```bash
+# 复制示例配置
+cp .env.example .env
+
+# 编辑配置(可选)
+# SERVICE_PORT=8000
+# DEBUG=True
+```
+
+### 3. 启动服务
+
+```bash
+# 开发模式(自动重载)
+python main.py
+
+# 或使用uvicorn
+uvicorn main:app --reload --port 8000
+```
+
+服务将在 http://localhost:8000 启动
+
+### 4. 测试服务
+
+#### 健康检查
+
+```bash
+curl http://localhost:8000/api/health
+```
+
+返回:
+```json
+{
+ "status": "healthy",
+ "checks": {
+ "pymupdf": {
+ "available": true,
+ "version": "1.23.8"
+ },
+ "temp_dir": {
+ "path": "/tmp/extraction_service",
+ "writable": true
+ }
+ }
+}
+```
+
+#### PDF文本提取
+
+```bash
+curl -X POST http://localhost:8000/api/extract/pdf \
+ -F "file=@test.pdf"
+```
+
+返回:
+```json
+{
+ "success": true,
+ "method": "pymupdf",
+ "text": "提取的文本内容...",
+ "metadata": {
+ "page_count": 20,
+ "char_count": 50000,
+ "file_size": 1024000,
+ "filename": "test.pdf"
+ }
+}
+```
+
+## API文档
+
+启动服务后访问:
+- Swagger UI: http://localhost:8000/docs
+- ReDoc: http://localhost:8000/redoc
+
+## 项目结构
+
+```
+extraction_service/
+├── main.py # 主应用入口
+├── requirements.txt # Python依赖
+├── .env.example # 环境变量示例
+├── README.md # 本文件
+├── services/ # 服务模块
+│ ├── __init__.py
+│ ├── pdf_extractor.py # PDF提取(PyMuPDF)
+│ ├── pdf_processor.py # PDF处理策略(顺序降级)
+│ ├── nougat_extractor.py # Nougat提取(Day 2)
+│ ├── docx_extractor.py # Docx提取(Day 3)
+│ ├── txt_extractor.py # Txt提取(Day 3)
+│ ├── language_detector.py # 语言检测(Day 2)
+│ └── file_utils.py # 文件工具
+└── tests/ # 测试文件(待添加)
+```
+
+## 开发计划
+
+### ✅ Day 1(已完成)
+- [x] FastAPI项目搭建
+- [x] PyMuPDF集成
+- [x] PDF文本提取功能
+- [x] 健康检查API
+
+### ⏳ Day 2(进行中)
+- [ ] 安装Nougat
+- [ ] 语言检测功能
+- [ ] Nougat提取逻辑
+- [ ] 顺序降级机制
+
+### ⏳ Day 3
+- [ ] Docx提取(Mammoth)
+- [ ] Txt提取(多编码)
+- [ ] 文件格式验证
+
+## 依赖说明
+
+| 库 | 版本 | 用途 |
+|---|---|---|
+| fastapi | 0.104.1 | Web框架 |
+| uvicorn | 0.24.0 | ASGI服务器 |
+| PyMuPDF | >=1.24.0 | PDF文本提取 |
+| pdfplumber | 0.10.3 | PDF语言检测 |
+| mammoth | 1.6.0 | Docx提取 |
+| langdetect | 1.0.9 | 语言检测 |
+| loguru | 0.7.2 | 日志管理 |
+
+## 性能指标
+
+| 操作 | 目标时间 |
+|---|---|
+| 20页PDF(PyMuPDF) | <30秒 |
+| 10页Docx | <10秒 |
+| 1MB Txt | <5秒 |
+
+## 常见问题
+
+### Q: PyMuPDF安装失败?
+A: 确保Python版本>=3.8,使用pip安装:`pip install PyMuPDF`
+
+### Q: 服务无法启动?
+A: 检查端口8000是否被占用,可修改.env中的SERVICE_PORT
+
+### Q: 临时文件在哪里?
+A: 默认在/tmp/extraction_service目录,可通过TEMP_DIR环境变量配置
+
+## License
+
+MIT
+
+
+
+
+
+
diff --git a/extraction_service/install.bat b/extraction_service/install.bat
new file mode 100644
index 00000000..51376170
--- /dev/null
+++ b/extraction_service/install.bat
@@ -0,0 +1,89 @@
+@echo off
+REM Installer for the document-extraction microservice (Windows).
+REM Steps: check Python, create the venv, activate it, upgrade pip,
+REM install requirements.txt, then verify the key imports.
+REM Switch the console to UTF-8 so the non-ASCII messages below render correctly.
+chcp 65001 >nul
+echo ================================
+echo 安装文档提取微服务依赖
+echo ================================
+echo.
+
+REM Step 1: make sure a Python interpreter is on PATH; abort otherwise.
+echo [1/5] 检查Python环境...
+python --version >nul 2>&1
+if errorlevel 1 (
+ echo ❌ 错误: 未找到Python
+ echo 请先安装Python 3.8或更高版本
+ echo 下载地址: https://www.python.org/downloads/
+ pause
+ exit /b 1
+)
+
+python --version
+echo ✅ Python已安装
+echo.
+
+REM Step 2: create the virtual environment unless a "venv" folder already exists.
+echo [2/5] 创建虚拟环境...
+if exist venv (
+ echo 虚拟环境已存在,跳过创建
+) else (
+ python -m venv venv
+ if errorlevel 1 (
+ echo ❌ 创建虚拟环境失败
+ pause
+ exit /b 1
+ )
+ echo ✅ 虚拟环境创建成功
+)
+echo.
+
+REM Step 3: activate the virtual environment so pip/python below refer to it.
+echo [3/5] 激活虚拟环境...
+call venv\Scripts\activate
+if errorlevel 1 (
+ echo ❌ 激活虚拟环境失败
+ pause
+ exit /b 1
+)
+echo ✅ 虚拟环境已激活
+echo.
+
+REM Step 4: upgrade pip (the result is not checked; a failure here is non-fatal).
+echo [4/5] 升级pip...
+python -m pip install --upgrade pip
+echo.
+
+REM Step 5: install the pinned dependencies; abort if pip reports an error.
+echo [5/5] 安装依赖包...
+echo 这可能需要几分钟时间...
+pip install -r requirements.txt
+if errorlevel 1 (
+ echo ❌ 依赖安装失败
+ pause
+ exit /b 1
+)
+echo.
+
+REM Verify the installation by importing the key packages and printing versions.
+echo ================================
+echo 验证安装
+echo ================================
+python -c "import fastapi; print('✅ FastAPI:', fastapi.__version__)"
+python -c "import fitz; print('✅ PyMuPDF:', fitz.__version__)"
+python -c "import uvicorn; print('✅ Uvicorn: OK')"
+echo.
+
+echo ================================
+echo 🎉 安装完成!
+echo ================================
+echo.
+echo 下一步:
+echo 1. 启动服务: start.bat
+echo 2. 测试服务: python test_service.py
+echo.
+
+pause
+
+
+
+
+
+
diff --git a/extraction_service/install_nougat.bat b/extraction_service/install_nougat.bat
new file mode 100644
index 00000000..c005d9b6
--- /dev/null
+++ b/extraction_service/install_nougat.bat
@@ -0,0 +1,88 @@
+@echo off
+REM Optional installer for Nougat OCR (Windows). Requires the virtual
+REM environment created by install.bat; installs nougat-ocr into it and
+REM checks both the Python module and the command-line tool.
+REM Switch the console to UTF-8 so the non-ASCII messages below render correctly.
+chcp 65001 >nul
+echo ================================
+echo 安装Nougat OCR
+echo ================================
+echo.
+
+echo ⚠️ 注意事项:
+echo 1. Nougat需要Python 3.8+
+echo 2. 首次运行会下载模型文件(约350MB)
+echo 3. 建议使用GPU加速(需CUDA)
+echo 4. 安装可能需要5-10分钟
+echo.
+pause
+
+REM Activate the virtual environment; abort if install.bat has not created it yet.
+if exist venv\Scripts\activate.bat (
+ echo [1/4] 激活虚拟环境...
+ call venv\Scripts\activate
+) else (
+ echo 错误: 请先运行 install.bat 创建虚拟环境
+ pause
+ exit /b 1
+)
+
+REM Install the pinned Nougat release.
+echo.
+echo [2/4] 安装Nougat OCR...
+echo 这可能需要几分钟时间...
+echo.
+
+pip install nougat-ocr==0.1.17
+
+REM On failure, print likely causes and the PyMuPDF fallback note, then abort.
+if errorlevel 1 (
+ echo.
+ echo ❌ Nougat安装失败
+ echo.
+ echo 可能的原因:
+ echo 1. 网络问题:请使用国内镜像源
+ echo 2. Python版本:需要Python 3.8+
+ echo 3. 依赖冲突:可能需要新的虚拟环境
+ echo.
+ echo 替代方案:
+ echo - 如果只使用中文PDF,可以不安装Nougat
+ echo - 系统会自动降级使用PyMuPDF
+ echo.
+ pause
+ exit /b 1
+)
+
+REM Check that the Python module can be imported.
+echo.
+echo [3/4] 验证安装...
+python -c "import nougat; print('✅ Nougat导入成功')"
+
+REM Check the command-line entry point; a failure here only prints a warning.
+echo.
+echo [4/4] 测试Nougat命令...
+nougat --version
+
+if errorlevel 1 (
+ echo ⚠️ 命令行工具未找到,但Python模块已安装
+ echo 这可能不影响使用,系统会尝试直接调用Python模块
+) else (
+ echo ✅ Nougat命令行工具正常
+)
+
+echo.
+echo ================================
+echo 🎉 Nougat安装完成!
+echo ================================
+echo.
+echo 说明:
+echo - Nougat擅长处理英文学术PDF
+echo - 能保留表格、公式等结构
+echo - 中文PDF会自动使用PyMuPDF
+echo - 首次使用会下载模型(约350MB)
+echo.
+echo 下一步:
+echo - 启动服务: start.bat
+echo - 健康检查: curl http://localhost:8000/api/health
+echo.
+
+pause
+
+
+
+
+
+
diff --git a/extraction_service/main.py b/extraction_service/main.py
new file mode 100644
index 00000000..5e6e6b83
--- /dev/null
+++ b/extraction_service/main.py
@@ -0,0 +1,508 @@
+"""
+文档提取微服务 - 主入口
+
+功能:
+- PDF文本提取(PyMuPDF)
+- Docx文本提取(Mammoth)
+- Txt文本提取(直接读取)
+- 语言检测
+- 健康检查
+"""
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from loguru import logger
+from pathlib import Path
+import os
+import sys
+from datetime import datetime
+from dotenv import load_dotenv
+
# Load variables from a local .env file (if any) before the os.getenv calls below.
load_dotenv()

# Logging: replace loguru's default sink with a stdout sink whose level is
# taken from the LOG_LEVEL environment variable (default INFO).
logger.remove()
logger.add(
    sys.stdout,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}",
    level=os.getenv("LOG_LEVEL", "INFO"),
)

# FastAPI application object; the route decorators below register on it.
app = FastAPI(
    title="文档提取微服务",
    description="提供PDF、Docx、Txt文档的文本提取服务",
    version="1.0.0",
)

# CORS: allowed origins come from CORS_ALLOW_ORIGINS (comma-separated list).
# When the variable is unset or empty the previous behaviour ("*") is kept, so
# existing deployments are unaffected; production should set explicit origins.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive - confirm whether credentialed cross-origin requests are needed.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory for uploaded files while they are being processed; created at
# import time so the endpoints can write into it immediately.
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
+
+# 导入服务模块
+from services.pdf_extractor import extract_pdf_pymupdf
+from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
+from services.language_detector import detect_language, detect_language_detailed
+from services.nougat_extractor import check_nougat_available, get_nougat_info
+from services.file_utils import detect_file_type, cleanup_temp_file
+from services.docx_extractor import extract_docx_mammoth, validate_docx_file
+from services.txt_extractor import extract_txt, validate_txt_file
+
+
+# ==================== API路由 ====================
+
@app.get("/")
async def root():
    """Return a small identification payload for the service root path."""
    service_info = {
        "service": "文档提取微服务",
        "version": "1.0.0",
        "status": "running",
    }
    return service_info
+
+
@app.get("/api/health")
async def health_check():
    """
    Health-check endpoint.

    Reports:
    - whether PyMuPDF can be imported, and its version
    - the Nougat status returned by get_nougat_info()
    - whether the temporary directory exists and is writable

    The overall status is "healthy" only when PyMuPDF is importable and the
    temporary directory is writable; otherwise it is "degraded".
    """
    # Start pessimistic; only a fully successful import flips the flag.
    pymupdf_available = False
    pymupdf_version = "unknown"
    try:
        import fitz  # PyMuPDF

        pymupdf_version = fitz.__version__
        pymupdf_available = True
    except Exception as e:
        logger.warning(f"PyMuPDF不可用: {str(e)}")

    nougat_info = get_nougat_info()

    # The directory must both exist and be writable by this process.
    temp_dir_writable = os.access(TEMP_DIR, os.W_OK) if TEMP_DIR.exists() else False

    overall = "degraded"
    if pymupdf_available and temp_dir_writable:
        overall = "healthy"

    checks = {
        "pymupdf": {
            "available": pymupdf_available,
            "version": pymupdf_version,
        },
        "nougat": nougat_info,
        "temp_dir": {
            "path": str(TEMP_DIR),
            "writable": temp_dir_writable,
        },
    }

    return {
        "status": overall,
        "checks": checks,
        "timestamp": datetime.now().isoformat(),
    }
+
+
def _require_suffix(file: UploadFile, suffix: str, detail: str) -> str:
    """Return the upload's filename, or raise HTTP 400 if it lacks ``suffix``.

    A missing filename (``None``) is treated as an empty name so that it is
    rejected with 400 instead of crashing with an AttributeError.
    """
    filename = file.filename or ""
    if not filename.lower().endswith(suffix):
        raise HTTPException(status_code=400, detail=detail)
    return filename


async def _save_upload(file: UploadFile, filename: str):
    """Write the uploaded bytes to a unique file under TEMP_DIR.

    Returns a ``(temp_path, size_in_bytes)`` tuple.

    The client-supplied filename is untrusted: only its final path component
    is used, so names such as ``../../x.pdf`` cannot escape TEMP_DIR. A random
    token keeps concurrent uploads of the same filename from overwriting each
    other (the pid alone is identical for every request in one worker).
    """
    import uuid  # local import keeps this block self-contained

    # Normalise Windows separators first so Path().name also strips them on POSIX.
    safe_name = Path(filename.replace("\\", "/")).name
    temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"

    content = await file.read()
    try:
        temp_path.write_bytes(content)
    except Exception:
        # Do not leave a partially written file behind.
        cleanup_temp_file(temp_path)
        raise
    return temp_path, len(content)


@app.post("/api/extract/pdf")
async def extract_pdf_endpoint(
    file: UploadFile = File(...),
    method: str = "auto"
):
    """
    PDF text extraction endpoint (method chosen automatically by default).

    Args:
        file: uploaded PDF file
        method: extraction method ('auto' | 'nougat' | 'pymupdf')
            - auto: let extract_pdf() decide (default)
            - nougat: force Nougat
            - pymupdf: force PyMuPDF

    Returns:
        JSON with "success", "method", "reason", "text" and "metadata"
        (the endpoint adds "file_size" and "filename" to the metadata).

    Raises:
        HTTPException: 400 for a non-PDF filename, 500 when extraction fails.
    """
    temp_path = None

    try:
        filename = _require_suffix(file, '.pdf', "文件格式错误,只支持PDF文件")

        logger.info(f"开始处理PDF文件: {filename}, 方法={method}")

        temp_path, file_size = await _save_upload(file, filename)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        # "auto" means "no forced method" for extract_pdf().
        force_method = None if method == "auto" else method
        result = extract_pdf(str(temp_path), force_method=force_method)

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
            )

        # Report the original (client-side) name, not the temp file name.
        result["metadata"]["file_size"] = file_size
        result["metadata"]["filename"] = filename

        logger.info(f"PDF提取成功: {filename}, "
                    f"方法={result['method']}, "
                    f"原因={result.get('reason', 'N/A')}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"PDF提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        # Always remove the temporary copy, on success and on failure.
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/detect-language")
async def detect_language_endpoint(file: UploadFile = File(...)):
    """
    PDF language detection endpoint.

    Args:
        file: uploaded PDF file

    Returns:
        JSON produced by detect_language_detailed() plus "filename".

    Raises:
        HTTPException: 400 for a non-PDF filename, 500 on unexpected errors.
    """
    temp_path = None

    try:
        filename = _require_suffix(file, '.pdf', "只支持PDF文件")

        temp_path, _ = await _save_upload(file, filename)

        result = detect_language_detailed(str(temp_path))
        result["filename"] = filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"语言检测失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/pdf-strategy")
async def get_strategy_endpoint(file: UploadFile = File(...)):
    """
    Return the processing strategy for a PDF without extracting it.

    Args:
        file: uploaded PDF file

    Returns:
        JSON produced by get_pdf_processing_strategy() plus "filename".

    Raises:
        HTTPException: 400 for a non-PDF filename, 500 on unexpected errors.
    """
    temp_path = None

    try:
        filename = _require_suffix(file, '.pdf', "只支持PDF文件")

        temp_path, _ = await _save_upload(file, filename)

        result = get_pdf_processing_strategy(str(temp_path))
        result["filename"] = filename

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"获取策略失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/extract/docx")
async def extract_docx_endpoint(file: UploadFile = File(...)):
    """
    Docx extraction endpoint.

    Args:
        file: uploaded .docx file

    Returns:
        JSON from extract_docx_mammoth() with "method" set to "mammoth" and
        "filename" added to the metadata.

    Raises:
        HTTPException: 400 for a non-.docx filename, 500 when extraction fails.
    """
    temp_path = None

    try:
        filename = _require_suffix(file, '.docx', "文件格式错误,只支持Docx文件")

        logger.info(f"开始处理Docx文件: {filename}")

        temp_path, file_size = await _save_upload(file, filename)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        result = extract_docx_mammoth(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
            )

        result["method"] = "mammoth"
        result["metadata"]["filename"] = filename

        logger.info(f"Docx提取成功: {filename}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)


@app.post("/api/extract/txt")
async def extract_txt_endpoint(file: UploadFile = File(...)):
    """
    Plain-text file extraction endpoint.

    Args:
        file: uploaded .txt file

    Returns:
        JSON from extract_txt() with "method" set to "direct" and "filename"
        added to the metadata.

    Raises:
        HTTPException: 400 for a non-.txt filename, 500 when extraction fails.
    """
    temp_path = None

    try:
        filename = _require_suffix(file, '.txt', "文件格式错误,只支持Txt文件")

        logger.info(f"开始处理Txt文件: {filename}")

        temp_path, file_size = await _save_upload(file, filename)
        logger.info(f"文件大小: {file_size / 1024:.2f} KB")

        result = extract_txt(str(temp_path))

        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
            )

        result["method"] = "direct"
        result["metadata"]["filename"] = filename

        logger.info(f"Txt提取成功: {filename}, "
                    f"编码={result['encoding']}, "
                    f"字符数={result['metadata']['char_count']}")

        return JSONResponse(content=result)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Txt提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )

    finally:
        if temp_path:
            cleanup_temp_file(temp_path)
+
+
@app.post("/api/extract")
async def extract_document(
    file: UploadFile = File(...),
    file_type: str = None
):
    """
    Generic extraction endpoint.

    Detects the file type from the filename (unless ``file_type`` is given)
    and delegates to the matching type-specific endpoint.

    Args:
        file: uploaded file
        file_type: optional explicit type ('pdf' | 'docx' | 'txt')

    Returns:
        Whatever the delegated endpoint returns.

    Raises:
        HTTPException: 400 for an unsupported type (detected or explicit),
            500 on unexpected errors.
    """
    try:
        if not file_type:
            try:
                file_type = detect_file_type(file.filename or "")
            except ValueError as e:
                # An unsupported extension is a client error, not a server fault.
                raise HTTPException(status_code=400, detail=str(e))

        logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")

        handlers = {
            'pdf': extract_pdf_endpoint,
            'docx': extract_docx_endpoint,
            'txt': extract_txt_endpoint,
        }
        handler = handlers.get(file_type)
        if handler is None:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件格式: {file_type},仅支持PDF、Docx、Txt"
            )

        # Only the file is passed on, so the PDF endpoint uses its default method.
        return await handler(file)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"文档提取失败: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"处理失败: {str(e)}"
        )
+
+
+# ==================== 启动配置 ====================
+
if __name__ == "__main__":
    import uvicorn

    # Bind address, port and reload flag all come from the environment.
    bind_host = os.getenv("SERVICE_HOST", "0.0.0.0")
    listen_port = int(os.getenv("SERVICE_PORT", 8000))
    auto_reload = os.getenv("DEBUG", "True").lower() == "true"

    base_url = f"http://{bind_host}:{listen_port}"

    logger.info("启动文档提取微服务...")
    logger.info(f"地址: {base_url}")
    logger.info(f"健康检查: {base_url}/api/health")
    logger.info(f"调试模式: {auto_reload}")

    # The app is passed as an import string, which uvicorn needs for reload mode.
    uvicorn.run(
        "main:app",
        host=bind_host,
        port=listen_port,
        reload=auto_reload,
        log_level="info",
    )
+
diff --git a/extraction_service/requirements.txt b/extraction_service/requirements.txt
new file mode 100644
index 00000000..cf136361
--- /dev/null
+++ b/extraction_service/requirements.txt
@@ -0,0 +1,31 @@
+# FastAPI核心依赖
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+python-multipart==0.0.6
+
+# PDF处理
+PyMuPDF>=1.24.0 # 使用更新版本,有预编译wheel
+pdfplumber==0.10.3
+nougat-ocr==0.1.17 # 学术PDF高质量提取(英文)
+albumentations==1.3.1 # Nougat兼容版本(不要升级到2.x)
+
+# Docx处理(Day 3需要)
+mammoth==1.6.0
+python-docx==1.1.0
+
+# 语言检测(Day 2需要)
+langdetect==1.0.9
+
+# 编码检测(Day 3需要)
+chardet==5.2.0
+
+# 工具
+python-dotenv==1.0.0
+pydantic>=2.10.0 # 使用更新版本,有预编译wheel
+
+# 日志
+loguru==0.7.2
+
+# 测试工具
+requests==2.31.0
+
diff --git a/extraction_service/services/__init__.py b/extraction_service/services/__init__.py
new file mode 100644
index 00000000..e9a7402c
--- /dev/null
+++ b/extraction_service/services/__init__.py
@@ -0,0 +1,11 @@
+"""
+服务模块
+
+包含各种文档提取和处理服务
+"""
+
+
+
+
+
+
diff --git a/extraction_service/services/docx_extractor.py b/extraction_service/services/docx_extractor.py
new file mode 100644
index 00000000..e911f2a9
--- /dev/null
+++ b/extraction_service/services/docx_extractor.py
@@ -0,0 +1,257 @@
+"""
+Docx文档提取服务
+
+使用Mammoth库提取Word文档文本
+支持.docx格式(不支持老版.doc)
+"""
+
+import mammoth
+from pathlib import Path
+from typing import Dict, Any
+from loguru import logger
+
+
def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
    """
    Extract the raw text of a .docx file with Mammoth.

    Args:
        file_path: path of the .docx file

    Returns:
        On success::

            {
                "success": True,
                "text": <extracted text>,
                "format": "plain_text",
                "metadata": {"char_count", "has_tables", "file_size", "warnings"}
            }

        On failure ``"success"`` is False, ``"text"`` is empty and ``"error"``
        describes the problem. The function never raises.
    """
    def _failure(reason: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        # Single place that builds the failure payload.
        return {
            "success": False,
            "error": reason,
            "text": "",
            "metadata": metadata
        }

    try:
        docx_path = Path(file_path)

        if not docx_path.exists():
            return _failure(f"文件不存在: {file_path}", {})

        extension = docx_path.suffix
        if extension.lower() != '.docx':
            return _failure(f"不支持的文件格式: {extension},仅支持.docx", {})

        logger.info(f"开始提取Docx文件: {docx_path.name}")

        with open(file_path, "rb") as stream:
            extraction = mammoth.extract_raw_text(stream)
        text = extraction.value
        messages = extraction.messages

        # Mammoth reports conversion problems as messages instead of raising.
        if messages:
            logger.warning(f"Mammoth提取警告: {len(messages)}个")
            for msg in messages:
                logger.debug(f" - {msg.type}: {msg.message}")

        char_count = len(text)
        if not char_count:
            logger.warning("提取的文本为空")
            return _failure(
                "文档内容为空或无法提取",
                {
                    "char_count": 0,
                    "file_size": docx_path.stat().st_size
                },
            )

        # Rough heuristic: a tab or pipe character is taken as a sign of tables.
        has_tables = any(marker in text for marker in ('\t', '|'))

        logger.info(f"Docx提取成功: {char_count}个字符")

        metadata = {
            "char_count": char_count,
            "has_tables": has_tables,
            "file_size": docx_path.stat().st_size,
            "warnings": len(messages)
        }
        return {
            "success": True,
            "text": text,
            "format": "plain_text",
            "metadata": metadata
        }

    except Exception as e:
        logger.error(f"Docx提取失败: {str(e)}")
        return _failure(str(e), {})
+
+
def extract_docx_html(file_path: str) -> Dict[str, Any]:
    """
    Convert a .docx file to HTML with Mammoth.

    Args:
        file_path: path of the .docx file

    Returns:
        On success::

            {
                "success": True,
                "html": <HTML string>,
                "format": "html",
                "metadata": {"html_length", "file_size", "warnings"}
            }

        On failure ``"success"`` is False, ``"html"`` is empty and ``"error"``
        describes the problem. The function never raises.
    """
    def _failure(reason: str) -> Dict[str, Any]:
        return {
            "success": False,
            "error": reason,
            "html": "",
            "metadata": {}
        }

    try:
        source = Path(file_path)

        if not source.exists():
            return _failure(f"文件不存在: {file_path}")

        logger.info(f"开始提取Docx为HTML: {source.name}")

        with open(file_path, "rb") as stream:
            conversion = mammoth.convert_to_html(stream)
        html = conversion.value
        messages = conversion.messages

        if messages:
            logger.warning(f"HTML转换警告: {len(messages)}个")

        html_length = len(html)
        logger.info(f"HTML提取成功: {html_length}个字符")

        return {
            "success": True,
            "html": html,
            "format": "html",
            "metadata": {
                "html_length": html_length,
                "file_size": source.stat().st_size,
                "warnings": len(messages)
            }
        }

    except Exception as e:
        logger.error(f"HTML提取失败: {str(e)}")
        return _failure(str(e))
+
+
def validate_docx_file(file_path: str) -> Dict[str, Any]:
    """
    Check whether a path looks like a usable .docx file.

    Checks, in order: the file exists, has a .docx suffix, is at most 50MB,
    is not empty, and starts with the ZIP local-file signature.

    Args:
        file_path: path of the file to check

    Returns:
        ``{"valid": False, "reason": ...}`` for the first failed check, or
        ``{"valid": True, "reason": ..., "file_info": {...}}`` when all pass.
        The function never raises.
    """
    def _invalid(reason: str) -> Dict[str, Any]:
        return {
            "valid": False,
            "reason": reason
        }

    try:
        candidate = Path(file_path)

        if not candidate.exists():
            return _invalid("文件不存在")

        if candidate.suffix.lower() != '.docx':
            return _invalid(f"不支持的格式: {candidate.suffix}(仅支持.docx)")

        size_limit = 50 * 1024 * 1024  # 50MB
        file_size = candidate.stat().st_size

        if file_size > size_limit:
            return _invalid(f"文件过大: {file_size / 1024 / 1024:.2f}MB(限制50MB)")

        if file_size == 0:
            return _invalid("文件为空")

        # A .docx is a ZIP container, so it must begin with the ZIP signature.
        try:
            with open(file_path, "rb") as handle:
                header = handle.read(4)
        except Exception as e:
            return _invalid(f"无法读取文件: {str(e)}")

        if header != b'PK\x03\x04':
            return _invalid("不是有效的Docx文件(ZIP签名错误)")

        return {
            "valid": True,
            "reason": "文件有效",
            "file_info": {
                "filename": candidate.name,
                "size": file_size,
                "size_mb": round(file_size / 1024 / 1024, 2)
            }
        }

    except Exception as e:
        return _invalid(f"验证失败: {str(e)}")
+
+
+
+
+
+
diff --git a/extraction_service/services/file_utils.py b/extraction_service/services/file_utils.py
new file mode 100644
index 00000000..55f51334
--- /dev/null
+++ b/extraction_service/services/file_utils.py
@@ -0,0 +1,88 @@
+"""
+文件工具函数
+"""
+
+import os
+from pathlib import Path
+from loguru import logger
+
+
def detect_file_type(filename: str) -> str:
    """
    Determine the document type from a filename's extension.

    The extension is the text after the last dot, compared case-insensitively.
    A name without any dot has no extension and is rejected; previously a
    bare name such as "pdf" was wrongly classified as a PDF.

    Args:
        filename: file name (may include directories)

    Returns:
        File type: 'pdf' | 'docx' | 'txt'

    Raises:
        ValueError: unsupported or missing extension
    """
    # rpartition yields an empty separator when the name contains no dot.
    _, dot, ext = filename.lower().rpartition('.')
    if not dot:
        raise ValueError("不支持的文件格式: (无扩展名)")

    if ext in ('pdf', 'docx', 'txt'):
        return ext

    raise ValueError(f"不支持的文件格式: .{ext}")
+
+
+def cleanup_temp_file(file_path: Path | str) -> None:
+ """
+ 清理临时文件
+
+ Args:
+ file_path: 文件路径
+ """
+ try:
+ if isinstance(file_path, str):
+ file_path = Path(file_path)
+
+ if file_path.exists():
+ file_path.unlink()
+ logger.debug(f"清理临时文件: {file_path}")
+ except Exception as e:
+ logger.warning(f"清理临时文件失败: {str(e)}")
+
+
+def get_file_size_mb(file_path: Path | str) -> float:
+ """
+ 获取文件大小(MB)
+
+ Args:
+ file_path: 文件路径
+
+ Returns:
+ 文件大小(MB)
+ """
+ if isinstance(file_path, str):
+ file_path = Path(file_path)
+
+ if file_path.exists():
+ return file_path.stat().st_size / (1024 * 1024)
+ return 0.0
+
+
def validate_file_size(file_size: int, max_size: int = 52428800) -> bool:
    """
    Check that a file size does not exceed the allowed maximum.

    Args:
        file_size: size in bytes
        max_size: largest accepted size in bytes (default 50MB)

    Returns:
        True when ``file_size`` is within the limit (the limit is inclusive)
    """
    within_limit = max_size >= file_size
    return within_limit
+
+
+
+
+
+
diff --git a/extraction_service/services/language_detector.py b/extraction_service/services/language_detector.py
new file mode 100644
index 00000000..db03ad3b
--- /dev/null
+++ b/extraction_service/services/language_detector.py
@@ -0,0 +1,160 @@
+"""
+语言检测服务
+
+检测PDF文档的主要语言(中文/英文/混合)
+用于决定使用哪种提取方法
+"""
+
+import pdfplumber
+from typing import Dict, Any
+from loguru import logger
+
+
def _sample_pdf_stats(pdf_path: str, max_pages: int = 3) -> Dict[str, Any]:
    """Extract text from the first ``max_pages`` pages and count its characters.

    Pages whose text extraction fails are logged and skipped. Errors opening
    the PDF propagate to the caller.
    """
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages[:max_pages]
        for index, page in enumerate(pages):
            try:
                page_text = page.extract_text()
            except Exception as e:
                logger.warning(f"第{index+1}页文本提取失败: {str(e)}")
                continue
            if page_text:
                parts.append(page_text + "\n")

    sample_text = "".join(parts)
    # Characters in the CJK Unified Ideographs block U+4E00..U+9FFF.
    chinese_chars = sum(1 for c in sample_text if '\u4e00' <= c <= '\u9fff')
    # Every non-whitespace character counts toward the total.
    total_chars = sum(1 for c in sample_text if c.strip())

    return {
        "sample_text": sample_text,
        "sample_pages": len(pages),
        "chinese_chars": chinese_chars,
        "total_chars": total_chars,
        "chinese_ratio": chinese_chars / total_chars if total_chars > 0 else 0,
    }


def _detect_language_with_threshold(pdf_path: str, threshold: float) -> str:
    """Classify a PDF as 'chinese' or 'english' using the given ratio threshold."""
    try:
        logger.info(f"开始语言检测: {pdf_path}")

        stats = _sample_pdf_stats(pdf_path)

        # Too little text to judge (e.g. scanned PDF): fall back to English.
        # This also guarantees total_chars > 0 below.
        if len(stats["sample_text"].strip()) < 100:
            logger.warning("文本太少,默认使用英文处理")
            return 'english'

        chinese_ratio = stats["chinese_ratio"]
        logger.info(
            f"中文字符比例: {chinese_ratio:.2%} "
            f"({stats['chinese_chars']}/{stats['total_chars']})"
        )

        language = 'chinese' if chinese_ratio > threshold else 'english'

        logger.info(f"检测结果: {language}")
        return language

    except Exception as e:
        logger.error(f"语言检测失败: {str(e)},默认使用英文处理")
        return 'english'


def detect_language(pdf_path: str) -> str:
    """
    Detect the main language of a PDF.

    Strategy:
    1. Extract the text of the first 3 pages
    2. Compute the share of Chinese characters among non-whitespace characters
    3. More than 30% Chinese -> 'chinese', otherwise 'english'

    When fewer than 100 characters can be extracted, or any error occurs,
    'english' is returned.

    Args:
        pdf_path: path of the PDF file

    Returns:
        'chinese' | 'english' (this function never returns 'mixed'; use
        detect_language_detailed() for the three-way classification)
    """
    return _detect_language_with_threshold(pdf_path, 0.3)


def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
    """
    Detect the language of a PDF and return the underlying statistics.

    Classification: ratio > 0.3 -> 'chinese', ratio > 0.1 -> 'mixed',
    otherwise 'english'. Unlike detect_language() there is no minimum-text
    requirement.

    Args:
        pdf_path: path of the PDF file

    Returns:
        {
            "language": "chinese" | "english" | "mixed",
            "chinese_ratio": 0.65,
            "chinese_chars": 3500,
            "total_chars": 5000,
            "sample_pages": 3,
            "sample_text_length": 5000
        }
        On error: {"language": "english", "error": <message>}
    """
    try:
        stats = _sample_pdf_stats(pdf_path)
        chinese_ratio = stats["chinese_ratio"]

        if chinese_ratio > 0.3:
            language = 'chinese'
        elif chinese_ratio > 0.1:
            language = 'mixed'
        else:
            language = 'english'

        return {
            "language": language,
            "chinese_ratio": round(chinese_ratio, 4),
            "chinese_chars": stats["chinese_chars"],
            "total_chars": stats["total_chars"],
            "sample_pages": stats["sample_pages"],
            "sample_text_length": len(stats["sample_text"])
        }

    except Exception as e:
        logger.error(f"详细语言检测失败: {str(e)}")
        return {
            "language": "english",
            "error": str(e)
        }


def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
    """
    Tell whether a PDF is predominantly Chinese.

    Uses the same sampling and fallbacks as detect_language(), but compares
    the Chinese-character ratio against ``threshold`` (the parameter was
    previously ignored). With the default threshold the result equals
    ``detect_language(pdf_path) == 'chinese'``.

    Args:
        pdf_path: path of the PDF file
        threshold: Chinese-character ratio threshold, default 30%

    Returns:
        True if the Chinese-character ratio is greater than ``threshold``
    """
    return _detect_language_with_threshold(pdf_path, threshold) == 'chinese'
+
+
+
+
+
+
diff --git a/extraction_service/services/nougat_extractor.py b/extraction_service/services/nougat_extractor.py
new file mode 100644
index 00000000..0fa11c66
--- /dev/null
+++ b/extraction_service/services/nougat_extractor.py
@@ -0,0 +1,241 @@
+"""
+Nougat提取服务
+
+使用Nougat OCR提取学术PDF的高质量文本
+保留表格、公式等结构信息
+"""
+
+import subprocess
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, Callable
+from loguru import logger
+
+
def check_nougat_available() -> bool:
    """
    Check whether the Nougat Python module can be imported.

    Returns:
        True if ``import nougat`` succeeds, False otherwise
    """
    available = False
    try:
        import nougat

        detected_version = getattr(nougat, '__version__', 'unknown')
        logger.info(f"Nougat module is available (version: {detected_version})")
        available = True
    except ImportError:
        logger.warning("Nougat module not found")
    except Exception as e:
        logger.error(f"检查Nougat失败: {str(e)}")
    return available
+
+
def extract_pdf_nougat(
    file_path: str,
    output_dir: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> Dict[str, Any]:
    """
    Extract a PDF to Markdown by running the ``nougat`` command-line tool.

    Args:
        file_path: path of the PDF file
        output_dir: directory for Nougat's output; defaults to a
            "nougat_output" folder next to the PDF
        progress_callback: accepted for interface compatibility; it is
            currently never invoked

    Returns:
        On success::

            {
                "success": True,
                "method": "nougat",
                "text": <Markdown text>,
                "format": "markdown",
                "metadata": {
                    "char_count", "quality_score",
                    "has_tables", "has_formulas", "has_structure"
                }
            }

        On failure: {"success": False, "error": <message>, "method": "nougat"}.
        The function never raises.
    """
    try:
        if not check_nougat_available():
            raise Exception("Nougat未安装,请先安装:pip install nougat-ocr")

        logger.info(f"开始使用Nougat提取: {file_path}")

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(file_path), "nougat_output")

        Path(output_dir).mkdir(parents=True, exist_ok=True)

        # Argument list (no shell), so the file path needs no quoting.
        cmd = [
            'nougat',
            file_path,
            '-o', output_dir,
            '--markdown',  # Markdown output
            '--no-skipping'  # do not skip any page
        ]

        logger.info(f"执行命令: {' '.join(cmd)}")

        # subprocess.run kills and reaps the child when the timeout expires;
        # Popen.communicate(timeout=...) alone would leave Nougat running.
        # errors="replace" keeps undecodable tool output from raising.
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
            timeout=300  # 5 minutes
        )

        if completed.returncode != 0:
            logger.error(f"Nougat执行失败: {completed.stderr}")
            raise Exception(f"Nougat执行失败: {completed.stderr}")

        # Nougat writes <pdf stem>.mmd into the output directory.
        # NOTE(review): the .mmd file is left on disk after reading - confirm
        # whether a caller cleans the output directory.
        output_file = Path(output_dir) / f"{Path(file_path).stem}.mmd"

        if not output_file.exists():
            raise Exception(f"Nougat输出文件不存在: {output_file}")

        with open(output_file, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        quality_result = evaluate_nougat_quality(markdown_text)

        logger.info(f"Nougat提取完成: 质量={quality_result['quality_score']:.2f}")

        return {
            "success": True,
            "method": "nougat",
            "text": markdown_text,
            "format": "markdown",
            "metadata": {
                "char_count": len(markdown_text),
                "quality_score": quality_result['quality_score'],
                "has_tables": quality_result['has_tables'],
                "has_formulas": quality_result['has_formulas'],
                "has_structure": quality_result['has_structure']
            }
        }

    except subprocess.TimeoutExpired:
        logger.error("Nougat处理超时(>5分钟)")
        return {
            "success": False,
            "error": "处理超时",
            "method": "nougat"
        }

    except Exception as e:
        logger.error(f"Nougat提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "method": "nougat"
        }
+
+
def evaluate_nougat_quality(text: str) -> Dict[str, Any]:
    """
    Score the quality of Markdown text produced by Nougat.

    Scoring (result clamped to the range 0..1):
    - base score: 0.5
    - heading structure present: +0.2
    - table present: +0.15
    - formula present: +0.15
    - more than 5000 characters: +0.1
    - garbled text detected: -0.3

    Args:
        text: Markdown text extracted by Nougat

    Returns:
        {
            "quality_score": 0.92,
            "has_structure": True,
            "has_tables": True,
            "has_formulas": True,
            "has_garbled": False
        }
    """
    # Heading structure: at least two '##' or at least three '#' occurrences.
    has_structure = text.count('##') >= 2 or text.count('#') >= 3

    # Table: a pipe together with a '---' separator row.
    has_tables = ('|' in text) and ('---' in text)

    # Formula: any LaTeX-style delimiter.
    has_formulas = any(token in text for token in ('$$', '$', '\\('))

    long_enough = len(text) > 5000

    # Garbled heuristic: characters above U+FFFF or replacement characters
    # making up more than 5% of the text.
    garbled_chars = len([c for c in text if ord(c) > 65535 or c in '��'])
    has_garbled = garbled_chars > len(text) * 0.05

    score = 0.5
    adjustments = (
        (has_structure, 0.2),
        (has_tables, 0.15),
        (has_formulas, 0.15),
        (long_enough, 0.1),
        (has_garbled, -0.3),
    )
    for applies, delta in adjustments:
        if applies:
            score += delta

    score = max(0.0, min(1.0, score))

    return {
        "quality_score": score,
        "has_structure": has_structure,
        "has_tables": has_tables,
        "has_formulas": has_formulas,
        "has_garbled": has_garbled
    }
+
+
def get_nougat_info() -> Dict[str, Any]:
    """
    Report whether Nougat is importable and, if so, its version.

    Returns:
        {"available": True, "version": <version or 'unknown'>} when the module
        imports, otherwise {"available": False, "error": <message>}
    """
    def _unavailable(reason: str) -> Dict[str, Any]:
        return {
            "available": False,
            "error": reason
        }

    try:
        import nougat

        return {
            "available": True,
            "version": getattr(nougat, '__version__', 'unknown')
        }
    except ImportError:
        return _unavailable("Nougat未安装")
    except Exception as e:
        return _unavailable(str(e))
+
diff --git a/extraction_service/services/pdf_extractor.py b/extraction_service/services/pdf_extractor.py
new file mode 100644
index 00000000..5c1d823c
--- /dev/null
+++ b/extraction_service/services/pdf_extractor.py
@@ -0,0 +1,191 @@
+"""
+PDF文本提取服务
+
+使用PyMuPDF (fitz)提取PDF文本内容
+"""
+
+import fitz # PyMuPDF
+from typing import Dict, Any
+from loguru import logger
+
+
+def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
+    """
+    Extract plain text from a PDF with PyMuPDF.
+
+    Pages are read in order. Every page that yields non-blank text is added
+    to the output, preceded by a page separator line. A page whose extraction
+    raises is logged and skipped.
+
+    Args:
+        file_path: Path of the PDF file.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "method": "pymupdf",
+            "text": combined text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": number of pages,
+                "char_count": length of the combined text (separators included),
+                "has_text": True when more than 100 characters came from the PDF
+            }
+        }
+        On failure: {"success": False, "error": message, "method": "pymupdf"}
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取: {file_path}")
+
+        doc = fitz.open(file_path)
+        try:
+            page_count = len(doc)
+            logger.info(f"PDF页数: {page_count}")
+
+            text_parts = []
+            # Characters that came from the PDF itself; the separators added
+            # below must not make a scanned document look like it has text.
+            extracted_chars = 0
+
+            for page_num in range(page_count):
+                try:
+                    text = doc[page_num].get_text()
+
+                    if text.strip():
+                        # Page separator
+                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                        text_parts.append(text)
+                        extracted_chars += len(text)
+
+                    logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符")
+
+                except Exception as e:
+                    # One broken page must not abort the whole document
+                    logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}")
+                    continue
+        finally:
+            # Release the document even when something above raised
+            doc.close()
+
+        full_text = "".join(text_parts)
+        char_count = len(full_text)
+
+        # At least 100 extracted characters are required to count as "has text"
+        has_text = extracted_chars > 100
+
+        if not has_text:
+            logger.warning("PDF可能是扫描版或无文本内容")
+
+        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")
+
+        return {
+            "success": True,
+            "method": "pymupdf",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": char_count,
+                "has_text": has_text
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"PyMuPDF提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "pymupdf"
+        }
+
+
+def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
+    """
+    Extract PDF text with PyMuPDF by walking the page's block structure.
+
+    Each page is read in "dict" mode. The non-blank spans of its text blocks
+    are joined with single spaces and emitted after a page separator line.
+    A page whose processing raises is logged and skipped.
+
+    Args:
+        file_path: Path of the PDF file.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "method": "pymupdf_layout",
+            "text": combined text,
+            "format": "plain_text",
+            "metadata": {"page_count": ..., "char_count": ...}
+        }
+        On failure:
+        {"success": False, "error": message, "method": "pymupdf_layout"}
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取(保留布局): {file_path}")
+
+        doc = fitz.open(file_path)
+        try:
+            page_count = len(doc)
+            text_parts = []
+
+            for page_num in range(page_count):
+                try:
+                    # "dict" mode exposes blocks -> lines -> spans
+                    blocks = doc[page_num].get_text("dict")["blocks"]
+
+                    page_text = []
+                    for block in blocks:
+                        if block["type"] != 0:  # only type 0 blocks hold text
+                            continue
+                        for line in block.get("lines", []):
+                            for span in line.get("spans", []):
+                                text = span.get("text", "")
+                                if text.strip():
+                                    page_text.append(text)
+
+                    if page_text:
+                        text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                        text_parts.append(" ".join(page_text))
+
+                except Exception as e:
+                    # One broken page must not abort the whole document
+                    logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}")
+                    continue
+        finally:
+            # Release the document even when something above raised
+            doc.close()
+
+        full_text = "".join(text_parts)
+
+        return {
+            "success": True,
+            "method": "pymupdf_layout",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": len(full_text)
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            # Same failure shape as extract_pdf_pymupdf
+            "method": "pymupdf_layout"
+        }
+
+
+def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
+    """
+    Read basic metadata of a PDF with PyMuPDF.
+
+    Args:
+        file_path: Path of the PDF file.
+
+    Returns:
+        {
+            "page_count": number of pages,
+            "metadata": the document's metadata mapping,
+            "is_encrypted": bool,
+            "is_pdf": bool
+        }
+        or an empty dict when the file cannot be opened or read (the error
+        is logged).
+    """
+    try:
+        doc = fitz.open(file_path)
+        try:
+            return {
+                "page_count": len(doc),
+                "metadata": doc.metadata,
+                "is_encrypted": doc.is_encrypted,
+                "is_pdf": doc.is_pdf
+            }
+        finally:
+            # Release the document even when reading a property raised
+            doc.close()
+
+    except Exception as e:
+        logger.error(f"获取PDF元数据失败: {str(e)}")
+        return {}
+
+
+
+
+
+
diff --git a/extraction_service/services/pdf_processor.py b/extraction_service/services/pdf_processor.py
new file mode 100644
index 00000000..9754c99f
--- /dev/null
+++ b/extraction_service/services/pdf_processor.py
@@ -0,0 +1,192 @@
+"""
+PDF处理主服务
+
+实现顺序降级策略:
+1. 检测语言
+2. 中文PDF → PyMuPDF(快速)
+3. 英文PDF → Nougat → 失败降级PyMuPDF
+"""
+
+from typing import Dict, Any, Optional
+from loguru import logger
+
+from .language_detector import detect_language
+from .nougat_extractor import extract_pdf_nougat, check_nougat_available
+from .pdf_extractor import extract_pdf_pymupdf
+
+
+def extract_pdf(
+    file_path: str,
+    force_method: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Main PDF extraction entry point (sequential fallback strategy).
+
+    Flow:
+    1. If force_method is 'nougat' or 'pymupdf', run that extractor and return.
+    2. Otherwise detect the document language.
+    3. Chinese -> PyMuPDF directly.
+    4. Anything else -> Nougat when available; fall back to PyMuPDF when
+       Nougat is unavailable, fails, or scores below the 0.7 quality threshold.
+
+    Args:
+        file_path: Path of the PDF file.
+        force_method: Extractor to force ('nougat' | 'pymupdf'). Any other
+            non-empty value is logged and ignored.
+
+    Returns:
+        The chosen extractor's result dict. Successful results are extended
+        with "reason" ('force_nougat' | 'force_pymupdf' | 'chinese_pdf' |
+        'nougat_unavailable' | 'english_pdf_high_quality' |
+        'nougat_failed_or_low_quality'); automatic results also carry
+        "detected_language", and the post-Nougat fallback sets
+        "fallback": True. Forced results carry "reason" even on failure.
+        An unexpected error yields
+        {"success": False, "error": message, "method": "unknown"}.
+    """
+    try:
+        logger.info(f"开始处理PDF: {file_path}")
+
+        # A forced method does not depend on the language, so it is handled
+        # before detection: no wasted work, and a detection failure cannot
+        # abort an extraction the caller asked for explicitly.
+        if force_method:
+            logger.info(f"强制使用方法: {force_method}")
+
+            if force_method == 'nougat':
+                result = extract_pdf_nougat(file_path)
+                result['reason'] = 'force_nougat'
+                return result
+            if force_method == 'pymupdf':
+                result = extract_pdf_pymupdf(file_path)
+                result['reason'] = 'force_pymupdf'
+                return result
+
+            # Unknown value: keep the automatic strategy, but say so
+            logger.warning(f"未知的强制方法: {force_method},改用自动策略")
+
+        # Step 1: language detection
+        logger.info("[Step 1] 检测PDF语言...")
+        language = detect_language(file_path)
+        logger.info(f"检测结果: {language}")
+
+        # Step 2: Chinese PDF -> PyMuPDF directly
+        if language == 'chinese':
+            logger.info("[Step 2] 中文PDF,使用PyMuPDF快速处理")
+
+            result = extract_pdf_pymupdf(file_path)
+
+            if result['success']:
+                result['reason'] = 'chinese_pdf'
+                result['detected_language'] = language
+                logger.info("✅ PyMuPDF处理成功(中文PDF)")
+            else:
+                logger.error("❌ PyMuPDF处理失败")
+            return result
+
+        # Step 3: other languages -> try Nougat
+        logger.info("[Step 3] 英文PDF,尝试Nougat高质量解析")
+
+        if not check_nougat_available():
+            logger.warning("⚠️ Nougat不可用,降级到PyMuPDF")
+
+            result = extract_pdf_pymupdf(file_path)
+            if result['success']:
+                result['reason'] = 'nougat_unavailable'
+                result['detected_language'] = language
+            return result
+
+        # Why Nougat's output was rejected; only used for the log line below
+        nougat_failure = None
+        try:
+            nougat_result = extract_pdf_nougat(file_path)
+
+            if nougat_result['success']:
+                quality_score = nougat_result['metadata'].get('quality_score', 0)
+                logger.info(f"Nougat质量评分: {quality_score:.2f}")
+
+                # Quality threshold: 0.7
+                if quality_score >= 0.7:
+                    logger.info("✅ Nougat处理成功(质量合格)")
+                    nougat_result['reason'] = 'english_pdf_high_quality'
+                    nougat_result['detected_language'] = language
+                    return nougat_result
+
+                logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f},降级到PyMuPDF")
+                nougat_failure = f"Quality too low: {quality_score}"
+            else:
+                logger.warning("⚠️ Nougat提取失败,降级到PyMuPDF")
+                nougat_failure = nougat_result.get('error', 'Nougat failed')
+
+        except Exception as e:
+            # Nougat itself raised, or its result dict had an unexpected shape
+            nougat_failure = str(e)
+
+        logger.warning(f"Nougat处理失败: {nougat_failure},降级到PyMuPDF")
+
+        # Step 4: fall back to PyMuPDF
+        logger.info("[Step 4] 降级使用PyMuPDF")
+
+        result = extract_pdf_pymupdf(file_path)
+
+        if result['success']:
+            result['reason'] = 'nougat_failed_or_low_quality'
+            result['detected_language'] = language
+            result['fallback'] = True
+            logger.info("✅ PyMuPDF处理成功(降级方案)")
+        else:
+            logger.error("❌ PyMuPDF处理也失败了")
+
+        return result
+
+    except Exception as e:
+        logger.error(f"PDF处理完全失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "unknown"
+        }
+
+
+def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
+    """
+    Preview which extractor would be chosen for a PDF, without extracting it.
+
+    Args:
+        file_path: Path of the PDF file.
+
+    Returns:
+        {
+            "detected_language": result of the language detector,
+            "recommended_method": "nougat" | "pymupdf",
+            "reason": human-readable explanation,
+            "nougat_available": True | False
+        }
+        or {"error": message} when detection or the availability check raises.
+    """
+    try:
+        detected = detect_language(file_path)
+        can_use_nougat = check_nougat_available()
+
+        # Chinese always goes to PyMuPDF; otherwise Nougat wins when present
+        if detected == 'chinese':
+            choice = ('pymupdf', '中文PDF,推荐使用PyMuPDF快速处理')
+        elif can_use_nougat:
+            choice = ('nougat', '英文PDF,推荐使用Nougat高质量解析')
+        else:
+            choice = ('pymupdf', 'Nougat不可用,使用PyMuPDF')
+
+        method, explanation = choice
+        return {
+            "detected_language": detected,
+            "recommended_method": method,
+            "reason": explanation,
+            "nougat_available": can_use_nougat
+        }
+
+    except Exception as exc:
+        logger.error(f"获取处理策略失败: {str(exc)}")
+        return {"error": str(exc)}
+
+
+
+
+
+
diff --git a/extraction_service/services/txt_extractor.py b/extraction_service/services/txt_extractor.py
new file mode 100644
index 00000000..b4e860e3
--- /dev/null
+++ b/extraction_service/services/txt_extractor.py
@@ -0,0 +1,320 @@
+"""
+Txt文本文件提取服务
+
+直接读取纯文本文件
+支持多种编码自动检测
+"""
+
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+import chardet
+
+
+def extract_txt(file_path: str) -> Dict[str, Any]:
+    """
+    Extract the content of a .txt file.
+
+    The encoding is detected from a sample of the file, then the file is
+    decoded with a list of fallback encodings. The whole file is read into
+    memory. A leading byte-order mark is removed from the returned text.
+
+    Args:
+        file_path: Path of the .txt file.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "text": file content,
+            "encoding": encoding that decoded the file,
+            "metadata": {
+                "char_count": number of characters,
+                "line_count": number of newline characters + 1,
+                "file_size": size in bytes,
+                "size_kb": size in KB, rounded to 2 decimals
+            }
+        }
+        On failure:
+        {"success": False, "error": message, "text": "", "metadata": {...}}
+    """
+    try:
+        file_path_obj = Path(file_path)
+
+        # The file must exist
+        if not file_path_obj.exists():
+            return {
+                "success": False,
+                "error": f"文件不存在: {file_path}",
+                "text": "",
+                "metadata": {}
+            }
+
+        # Only .txt is accepted
+        if file_path_obj.suffix.lower() != '.txt':
+            return {
+                "success": False,
+                "error": f"不支持的文件格式: {file_path_obj.suffix},仅支持.txt",
+                "text": "",
+                "metadata": {}
+            }
+
+        file_size = file_path_obj.stat().st_size
+
+        # Empty files are reported as a failure
+        if file_size == 0:
+            return {
+                "success": False,
+                "error": "文件为空",
+                "text": "",
+                "metadata": {
+                    "char_count": 0,
+                    "line_count": 0,
+                    "file_size": 0
+                }
+            }
+
+        logger.info(f"开始提取Txt文件: {file_path_obj.name} ({file_size / 1024:.2f} KB)")
+
+        detected_encoding = detect_encoding(file_path)
+        logger.info(f"检测到编码: {detected_encoding}")
+
+        # Decode, falling back through other encodings when needed
+        text, actual_encoding = read_with_fallback(file_path, detected_encoding)
+
+        if text is None:
+            return {
+                "success": False,
+                "error": "无法解码文件,尝试了多种编码均失败",
+                "text": "",
+                "metadata": {}
+            }
+
+        # A BOM-prefixed UTF-8 file decoded as plain 'utf-8' keeps U+FEFF as
+        # its first character; drop it so callers never see the marker.
+        if text.startswith('\ufeff'):
+            text = text[1:]
+
+        char_count = len(text)
+        line_count = text.count('\n') + 1
+
+        logger.info(f"Txt提取成功: {char_count}个字符, {line_count}行")
+
+        return {
+            "success": True,
+            "text": text,
+            "encoding": actual_encoding,
+            "metadata": {
+                "char_count": char_count,
+                "line_count": line_count,
+                "file_size": file_size,
+                "size_kb": round(file_size / 1024, 2)
+            }
+        }
+
+    except Exception as e:
+        logger.error(f"Txt提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "text": "",
+            "metadata": {}
+        }
+
+
+def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
+    """
+    Guess a file's text encoding from its leading bytes using chardet.
+
+    Args:
+        file_path: Path of the file.
+        sample_size: Number of bytes read from the start of the file.
+
+    Returns:
+        The detected encoding name, or 'utf-8' when the detector reports a
+        confidence below 0.7, reports no encoding, or anything raises.
+    """
+    default_encoding = 'utf-8'
+
+    try:
+        with open(file_path, 'rb') as handle:
+            sample = handle.read(sample_size)
+
+        guess = chardet.detect(sample)
+        encoding, confidence = guess['encoding'], guess['confidence']
+
+        logger.debug(f"编码检测: {encoding} (置信度: {confidence:.2f})")
+
+        # A weak guess is not trusted; UTF-8 is the default
+        if confidence < 0.7:
+            logger.warning(f"编码置信度较低({confidence:.2f}),将尝试UTF-8")
+            return default_encoding
+
+        return encoding or default_encoding
+
+    except Exception as exc:
+        logger.warning(f"编码检测失败: {str(exc)},使用UTF-8")
+        return default_encoding
+
+
+def read_with_fallback(
+    file_path: str,
+    primary_encoding: str
+) -> "tuple[str | None, str | None]":
+    """
+    Read a text file, trying several encodings in priority order.
+
+    The return annotation is a string so it is never evaluated at definition
+    time and therefore does not depend on the interpreter version.
+
+    Args:
+        file_path: Path of the file.
+        primary_encoding: Encoding to try first.
+
+    Returns:
+        (text, encoding used) on success, or (None, None) when no candidate
+        decodes the file or the file cannot be read at all.
+    """
+    import codecs  # function-scope import keeps this block self-contained
+
+    # Candidates in priority order. 'latin-1' maps every byte to a character
+    # and can never fail, so it must be last; any entry placed after it
+    # (previously 'cp1252' and the alias 'iso-8859-1') would be unreachable.
+    candidates = [
+        primary_encoding,
+        'utf-8',
+        'utf-8-sig',  # UTF-8 with BOM
+        'gbk',
+        'gb2312',
+        'gb18030',
+        'cp1252',
+        'latin-1'
+    ]
+
+    # De-duplicate by canonical codec name so aliases (for example
+    # 'ISO-8859-1' vs 'latin-1', 'UTF8' vs 'utf-8') are tried only once.
+    seen = set()
+    unique_encodings = []
+    for enc in candidates:
+        if not enc:
+            continue
+        try:
+            canonical = codecs.lookup(enc).name
+        except LookupError:
+            logger.debug(f"未知编码 {enc},已跳过")
+            continue
+        if canonical not in seen:
+            seen.add(canonical)
+            unique_encodings.append(enc)
+
+    for encoding in unique_encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
+                text = f.read()
+
+            logger.info(f"成功使用编码: {encoding}")
+            return text, encoding
+
+        except UnicodeDecodeError:
+            logger.debug(f"编码 {encoding} 解码失败,尝试下一个")
+            continue
+
+        except OSError as e:
+            # The file itself is unreadable; another encoding cannot help
+            logger.error(f"使用编码 {encoding} 读取失败: {str(e)}")
+            return None, None
+
+        except Exception as e:
+            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
+            continue
+
+    # Every candidate failed
+    logger.error("所有编码尝试均失败")
+    return None, None
+
+
+def validate_txt_file(file_path: str) -> Dict[str, Any]:
+    """
+    Check whether a path points to an acceptable .txt file.
+
+    Checks run in this order: the path exists, the suffix is .txt, the size
+    is at most 10 MB, the file is not empty.
+
+    Args:
+        file_path: Path of the file.
+
+    Returns:
+        {"valid": True, "reason": ..., "file_info": {"filename", "size",
+        "size_kb", "detected_encoding"}} for an accepted file, otherwise
+        {"valid": False, "reason": ...}.
+    """
+    size_limit = 10 * 1024 * 1024  # 10 MB
+
+    def rejected(why: str) -> Dict[str, Any]:
+        return {"valid": False, "reason": why}
+
+    try:
+        target = Path(file_path)
+
+        if not target.exists():
+            return rejected("文件不存在")
+
+        extension = target.suffix
+        if extension.lower() != '.txt':
+            return rejected(f"不支持的格式: {extension}(仅支持.txt)")
+
+        size_bytes = target.stat().st_size
+        if size_bytes > size_limit:
+            return rejected(f"文件过大: {size_bytes / 1024 / 1024:.2f}MB(限制10MB)")
+        if size_bytes == 0:
+            return rejected("文件为空")
+
+        # Encoding is reported for information only; it never rejects a file
+        guessed_encoding = detect_encoding(str(target))
+
+        details = {
+            "filename": target.name,
+            "size": size_bytes,
+            "size_kb": round(size_bytes / 1024, 2),
+            "detected_encoding": guessed_encoding
+        }
+        return {"valid": True, "reason": "文件有效", "file_info": details}
+
+    except Exception as exc:
+        return rejected(f"验证失败: {str(exc)}")
+
+
+def preview_txt(file_path: str, lines: int = 10) -> Dict[str, Any]:
+    """
+    Return the first lines of a .txt file.
+
+    The file is fully extracted with extract_txt and then truncated.
+
+    Args:
+        file_path: Path of the file.
+        lines: Maximum number of lines in the preview; values below 0 are
+            treated as 0.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "preview": first lines joined with newlines,
+            "total_lines": number of lines in the file,
+            "preview_lines": number of lines in the preview
+        }
+        On failure: the extractor's failure dict (or
+        {"success": False, "error": message}) with "preview": "" added.
+    """
+    try:
+        result = extract_txt(file_path)
+
+        if not result['success']:
+            # Keep the extractor's fields and add the "preview" key, so every
+            # failure of this function has the same shape.
+            return {**result, "preview": ""}
+
+        text_lines = result['text'].split('\n')
+
+        # A negative count would slice from the end of the file instead of
+        # producing an empty preview.
+        preview_lines = text_lines[:max(lines, 0)]
+        preview = '\n'.join(preview_lines)
+
+        return {
+            "success": True,
+            "preview": preview,
+            "total_lines": len(text_lines),
+            "preview_lines": len(preview_lines)
+        }
+
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "preview": ""
+        }
+
+
+
+
+
+
diff --git a/extraction_service/start.bat b/extraction_service/start.bat
new file mode 100644
index 00000000..4a1781ef
--- /dev/null
+++ b/extraction_service/start.bat
@@ -0,0 +1,37 @@
+@echo off
+chcp 65001 >nul
+
+REM Run from the directory of this script so that venv, requirements.txt
+REM and main.py resolve no matter where the script was launched from.
+cd /d "%~dp0"
+
+echo ================================
+echo 启动文档提取微服务
+echo ================================
+echo.
+
+REM Activate the virtual environment when one exists
+if exist venv\Scripts\activate.bat (
+    echo [1/3] 激活虚拟环境...
+    call venv\Scripts\activate.bat
+) else (
+    echo 警告: 未找到虚拟环境,使用全局Python
+)
+
+REM Install the dependencies when FastAPI is not installed yet
+echo [2/3] 检查依赖...
+pip list | findstr "fastapi" >nul
+if errorlevel 1 (
+    echo 依赖未安装,正在安装...
+    pip install -r requirements.txt
+    REM Stop here instead of starting a service that cannot import its deps
+    if errorlevel 1 (
+        echo 错误: 依赖安装失败
+        pause
+        exit /b 1
+    )
+)
+
+REM Start the service
+echo [3/3] 启动服务...
+echo.
+echo 服务地址: http://localhost:8000
+echo 健康检查: http://localhost:8000/api/health
+echo API文档: http://localhost:8000/docs
+echo.
+echo 按 Ctrl+C 停止服务
+echo.
+
+uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+
+pause
+
diff --git a/extraction_service/test_files/test.txt b/extraction_service/test_files/test.txt
new file mode 100644
index 00000000..2dd4c91c
--- /dev/null
+++ b/extraction_service/test_files/test.txt
@@ -0,0 +1,29 @@
+这是一个测试文本文件。
+用于测试Txt文件提取功能。
+
+AI临床研究平台 - Phase 2 Day 3测试
+
+功能特点:
+1. 自动编码检测
+2. 支持UTF-8、GBK等多种编码
+3. 统计字符数和行数
+4. 快速文本提取
+
+测试内容包含:
+- 中文字符
+- 英文字符 (English characters)
+- 数字 123456
+- 特殊符号 !@#$%^&*()
+
+多行文本测试:
+第一行
+第二行
+第三行
+
+结束。
+
+
+
+
+
+
diff --git a/extraction_service/test_service.py b/extraction_service/test_service.py
new file mode 100644
index 00000000..ba67860c
--- /dev/null
+++ b/extraction_service/test_service.py
@@ -0,0 +1,171 @@
+"""
+服务测试脚本
+
+测试文档提取微服务的各项功能
+"""
+
+import requests
+import sys
+from pathlib import Path
+
+
+BASE_URL = "http://localhost:8000"
+
+
+def test_health():
+    """
+    Call GET /api/health and print the reported service status.
+
+    Returns:
+        True when the endpoint answers 200 with the expected fields,
+        False otherwise.
+    """
+    print("\n" + "="*50)
+    print("测试1: 健康检查")
+    print("="*50)
+
+    try:
+        # Without a timeout a hung server would block this script forever
+        response = requests.get(f"{BASE_URL}/api/health", timeout=10)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务状态: {data['status']}")
+            print(f"PyMuPDF: {data['checks']['pymupdf']['available']} (v{data['checks']['pymupdf']['version']})")
+            print(f"临时目录: {data['checks']['temp_dir']['path']}")
+            print("✅ 健康检查通过")
+            return True
+        else:
+            print("❌ 健康检查失败")
+            return False
+    except (KeyError, TypeError) as e:
+        # The server answered, but the payload lacks an expected field
+        print(f"❌ 响应格式异常: {e!r}")
+        return False
+    except Exception as e:
+        print(f"❌ 连接失败: {str(e)}")
+        print("提示: 请确保服务已启动(python main.py)")
+        return False
+
+
+def test_pdf_extraction(pdf_file: str = None):
+    """
+    Upload a PDF to POST /api/extract/pdf and print the extraction result.
+
+    Args:
+        pdf_file: Path of the PDF to upload; the test is skipped when empty.
+
+    Returns:
+        True on success, False on failure, None when the test is skipped.
+    """
+    print("\n" + "="*50)
+    print("测试2: PDF文本提取")
+    print("="*50)
+
+    if not pdf_file:
+        print("跳过: 未提供测试PDF文件")
+        print("使用方法: python test_service.py <pdf_file>")
+        return None
+
+    pdf_path = Path(pdf_file)
+
+    if not pdf_path.exists():
+        print(f"❌ 文件不存在: {pdf_file}")
+        return False
+
+    try:
+        print(f"上传文件: {pdf_path.name}")
+        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")
+
+        with open(pdf_path, 'rb') as f:
+            files = {'file': (pdf_path.name, f, 'application/pdf')}
+            response = requests.post(
+                f"{BASE_URL}/api/extract/pdf",
+                files=files,
+                # Fail fast when the service is down (10 s connect timeout),
+                # but put no limit on the read: extraction time is unknown.
+                timeout=(10, None)
+            )
+
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+
+            print("\n提取结果:")
+            print(f"方法: {data['method']}")
+            print(f"页数: {data['metadata']['page_count']}")
+            print(f"字符数: {data['metadata']['char_count']}")
+            print(f"文本长度: {len(data['text'])} 字符")
+
+            # Show the first 500 characters
+            print("\n文本预览:")
+            print("-" * 50)
+            print(data['text'][:500])
+            if len(data['text']) > 500:
+                print("...")
+            print("-" * 50)
+
+            print("\n✅ PDF提取成功")
+            return True
+        else:
+            print(f"❌ 提取失败: {response.text}")
+            return False
+
+    except Exception as e:
+        print(f"❌ 请求失败: {str(e)}")
+        return False
+
+
+def test_root():
+    """
+    Call GET / and print the service name and version.
+
+    Returns:
+        True when the endpoint answers 200 with the expected fields,
+        False otherwise.
+    """
+    print("\n" + "="*50)
+    print("测试0: 根路径")
+    print("="*50)
+
+    try:
+        # Without a timeout a hung server would block this script forever
+        response = requests.get(f"{BASE_URL}/", timeout=10)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务: {data['service']}")
+            print(f"版本: {data['version']}")
+            print("✅ 根路径正常")
+            return True
+        else:
+            print("❌ 根路径异常")
+            return False
+    except Exception as e:
+        print(f"❌ 连接失败: {str(e)}")
+        return False
+
+
+def main():
+    """
+    Run every service check and print a summary.
+
+    The optional first command-line argument is the PDF used by the
+    extraction test; without it that test is skipped.
+
+    Returns:
+        0 when every test that ran passed, 1 otherwise. The value is used as
+        the process exit code so callers can detect failures.
+    """
+    print("\n" + "="*50)
+    print("文档提取微服务 - 测试套件")
+    print("="*50)
+
+    # Path of the test PDF, when one was given
+    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None
+
+    results = [
+        ("根路径", test_root()),
+        ("健康检查", test_health()),
+        ("PDF提取", test_pdf_extraction(pdf_file)),
+    ]
+
+    # Summary
+    print("\n" + "="*50)
+    print("测试总结")
+    print("="*50)
+
+    for name, result in results:
+        if result is True:
+            status = "✅ 通过"
+        elif result is False:
+            status = "❌ 失败"
+        else:
+            status = "⏭️ 跳过"
+        print(f"{name}: {status}")
+
+    # Skipped tests (None) count neither as passed nor towards the total
+    passed = sum(1 for _, r in results if r is True)
+    total = sum(1 for _, r in results if r is not None)
+
+    print(f"\n通过率: {passed}/{total}")
+
+    if passed == total:
+        print("\n🎉 所有测试通过!")
+        return 0
+
+    print("\n⚠️ 部分测试失败")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+
+
+
+
+
+