feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,78 @@ tmp/
 temp/
 *.tmp

+# ==================== Python ====================
+# Virtual environments (重要！避免提交 2+ GB 的依赖)
+venv/
+env/
+.venv/
+ENV/
+env.bak/
+venv.bak/

+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python

+# PyInstaller
+*.manifest
+*.spec

+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# Celery
+celerybeat-schedule
+celerybeat.pid
+
+# Environments
+.env
+.venv
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Unit test / coverage
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre
+.pyre/
+
+# pytype
+.pytype/
+
+# Cython
+cython_debug/
--- a/extraction_service/.gitignore
+++ b/extraction_service/.gitignore
@@ -0,0 +1,40 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+build/
+dist/
+*.egg-info/
+
+# 环境变量
+.env
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# 临时文件
+/tmp/
+*.log
+
+# 测试
+.pytest_cache/
+.coverage
+htmlcov/
+
+# OS
+.DS_Store
+Thumbs.db
+
+
+
+
+
+
--- a/extraction_service/README.md
+++ b/extraction_service/README.md
@@ -0,0 +1,181 @@
+# 文档提取微服务
+
+基于FastAPI的文档文本提取服务，支持PDF、Docx、Txt格式。
+
+## 功能特性
+
+- ✅ **PDF提取**：使用PyMuPDF快速提取PDF文本
+- ⏳ **Docx提取**：使用Mammoth提取Word文档（Day 3）
+- ⏳ **Txt提取**：支持多种编码（Day 3）
+- ⏳ **语言检测**：自动检测PDF语言（Day 2）
+- ⏳ **Nougat集成**：高质量学术PDF解析（Day 2）
+
+## 快速开始
+
+### 1. 安装依赖
+
+```bash
+cd extraction_service
+
+# 创建虚拟环境（推荐）
+python -m venv venv
+source venv/bin/activate  # Windows: venv\Scripts\activate
+
+# 安装依赖
+pip install -r requirements.txt
+```
+
+### 2. 配置环境变量
+
+```bash
+# 复制示例配置
+cp .env.example .env
+
+# 编辑配置（可选）
+# SERVICE_PORT=8000
+# DEBUG=True
+```
+
+### 3. 启动服务
+
+```bash
+# 开发模式（自动重载）
+python main.py
+
+# 或使用uvicorn
+uvicorn main:app --reload --port 8000
+```
+
+服务将在 http://localhost:8000 启动
+
+### 4. 测试服务
+
+#### 健康检查
+
+```bash
+curl http://localhost:8000/api/health
+```
+
+返回：
+```json
+{
+  "status": "healthy",
+  "checks": {
+    "pymupdf": {
+      "available": true,
+      "version": "1.23.8"
+    },
+    "temp_dir": {
+      "path": "/tmp/extraction_service",
+      "writable": true
+    }
+  }
+}
+```
+
+#### PDF文本提取
+
+```bash
+curl -X POST http://localhost:8000/api/extract/pdf \
+  -F "file=@test.pdf"
+```
+
+返回：
+```json
+{
+  "success": true,
+  "method": "pymupdf",
+  "text": "提取的文本内容...",
+  "metadata": {
+    "page_count": 20,
+    "char_count": 50000,
+    "file_size": 1024000,
+    "filename": "test.pdf"
+  }
+}
+```
+
+## API文档
+
+启动服务后访问：
+- Swagger UI: http://localhost:8000/docs
+- ReDoc: http://localhost:8000/redoc
+
+## 项目结构
+
+```
+extraction_service/
+├── main.py              # 主应用入口
+├── requirements.txt     # Python依赖
+├── .env.example         # 环境变量示例
+├── README.md           # 本文件
+├── services/           # 服务模块
+│   ├── __init__.py
+│   ├── pdf_extractor.py      # PDF提取（PyMuPDF）
+│   ├── pdf_processor.py      # PDF处理策略（自动选择/顺序降级）
+│   ├── nougat_extractor.py   # Nougat提取（Day 2）
+│   ├── docx_extractor.py     # Docx提取（Day 3）
+│   ├── txt_extractor.py      # Txt提取（Day 3）
+│   ├── language_detector.py  # 语言检测（Day 2）
+│   └── file_utils.py         # 文件工具
+└── tests/              # 测试文件（待添加）
+```
+
+## 开发计划
+
+### ✅ Day 1（已完成）
+- [x] FastAPI项目搭建
+- [x] PyMuPDF集成
+- [x] PDF文本提取功能
+- [x] 健康检查API
+
+### ⏳ Day 2（进行中）
+- [ ] 安装Nougat
+- [ ] 语言检测功能
+- [ ] Nougat提取逻辑
+- [ ] 顺序降级机制
+
+### ⏳ Day 3
+- [ ] Docx提取（Mammoth）
+- [ ] Txt提取（多编码）
+- [ ] 文件格式验证
+
+## 依赖说明
+
+| 库 | 版本 | 用途 |
+|---|---|---|
+| fastapi | 0.104.1 | Web框架 |
+| uvicorn | 0.24.0 | ASGI服务器 |
+| PyMuPDF | >=1.24.0 | PDF文本提取 |
+| pdfplumber | 0.10.3 | PDF语言检测 |
+| mammoth | 1.6.0 | Docx提取 |
+| langdetect | 1.0.9 | 语言检测 |
+| loguru | 0.7.2 | 日志管理 |
+
+## 性能指标
+
+| 操作 | 目标时间 |
+|---|---|
+| 20页PDF（PyMuPDF） | <30秒 |
+| 10页Docx | <10秒 |
+| 1MB Txt | <5秒 |
+
+## 常见问题
+
+### Q: PyMuPDF安装失败？
+A: 确保Python版本>=3.8，使用pip安装：`pip install PyMuPDF`
+
+### Q: 服务无法启动？
+A: 检查端口8000是否被占用，可修改.env中的SERVICE_PORT
+
+### Q: 临时文件在哪里？
+A: 默认在/tmp/extraction_service目录，可通过TEMP_DIR环境变量配置
+
+## License
+
+MIT
+
+
+
+
+
+
--- a/extraction_service/install.bat
+++ b/extraction_service/install.bat
@@ -0,0 +1,89 @@
+@echo off
+chcp 65001 >nul
+REM Installer for the document extraction service dependencies.
+REM Steps: check Python, create the venv, activate it, upgrade pip,
+REM install requirements.txt, then verify the key imports.
+echo ================================
+echo 安装文档提取微服务依赖
+echo ================================
+echo.
+
+REM Step 1: make sure a "python" command is available
+echo [1/5] 检查Python环境...
+python --version >nul 2>&1
+if errorlevel 1 (
+    echo ❌ 错误: 未找到Python
+    echo 请先安装Python 3.8或更高版本
+    echo 下载地址: https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+python --version
+echo ✅ Python已安装
+echo.
+
+REM Step 2: create the virtual environment unless a "venv" directory already exists
+echo [2/5] 创建虚拟环境...
+if exist venv (
+    echo 虚拟环境已存在，跳过创建
+) else (
+    python -m venv venv
+    if errorlevel 1 (
+        echo ❌ 创建虚拟环境失败
+        pause
+        exit /b 1
+    )
+    echo ✅ 虚拟环境创建成功
+)
+echo.
+
+REM Step 3: activate the virtual environment for the rest of this script
+echo [3/5] 激活虚拟环境...
+call venv\Scripts\activate
+if errorlevel 1 (
+    echo ❌ 激活虚拟环境失败
+    pause
+    exit /b 1
+)
+echo ✅ 虚拟环境已激活
+echo.
+
+REM Step 4: upgrade pip (the result is not checked; a failure here does not abort)
+echo [4/5] 升级pip...
+python -m pip install --upgrade pip
+echo.
+
+REM Step 5: install the pinned dependencies; abort on failure
+echo [5/5] 安装依赖包...
+echo 这可能需要几分钟时间...
+pip install -r requirements.txt
+if errorlevel 1 (
+    echo ❌ 依赖安装失败
+    pause
+    exit /b 1
+)
+echo.
+
+REM Verify the installation by importing the key packages
+REM NOTE(review): the exit codes of these import checks are not tested,
+REM so the script still prints the success banner if one of them fails.
+echo ================================
+echo 验证安装
+echo ================================
+python -c "import fastapi; print('✅ FastAPI:', fastapi.__version__)"
+python -c "import fitz; print('✅ PyMuPDF:', fitz.__version__)"
+python -c "import uvicorn; print('✅ Uvicorn: OK')"
+echo.
+
+echo ================================
+echo 🎉 安装完成！
+echo ================================
+echo.
+echo 下一步:
+echo 1. 启动服务: start.bat
+echo 2. 测试服务: python test_service.py
+echo.
+
+pause
+
+
+
+
+
+
--- a/extraction_service/install_nougat.bat
+++ b/extraction_service/install_nougat.bat
@@ -0,0 +1,88 @@
+@echo off
+chcp 65001 >nul
+REM Optional installer for Nougat OCR into the existing venv.
+REM Steps: activate the venv, pip-install nougat-ocr, verify the Python
+REM import, then probe the "nougat" command-line tool.
+echo ================================
+echo 安装Nougat OCR
+echo ================================
+echo.
+
+echo ⚠️ 注意事项：
+echo 1. Nougat需要Python 3.8+
+echo 2. 首次运行会下载模型文件（约350MB）
+echo 3. 建议使用GPU加速（需CUDA）
+echo 4. 安装可能需要5-10分钟
+echo.
+pause
+
+REM Activate the virtual environment; abort if install.bat has not created it yet
+if exist venv\Scripts\activate.bat (
+    echo [1/4] 激活虚拟环境...
+    call venv\Scripts\activate
+) else (
+    echo 错误: 请先运行 install.bat 创建虚拟环境
+    pause
+    exit /b 1
+)
+
+REM Install Nougat (pinned to the same version as requirements.txt)
+echo.
+echo [2/4] 安装Nougat OCR...
+echo 这可能需要几分钟时间...
+echo.
+
+pip install nougat-ocr==0.1.17
+
+if errorlevel 1 (
+    echo.
+    echo ❌ Nougat安装失败
+    echo.
+    echo 可能的原因：
+    echo 1. 网络问题：请使用国内镜像源
+    echo 2. Python版本：需要Python 3.8+
+    echo 3. 依赖冲突：可能需要新的虚拟环境
+    echo.
+    echo 替代方案：
+    echo - 如果只使用中文PDF，可以不安装Nougat
+    echo - 系统会自动降级使用PyMuPDF
+    echo.
+    pause
+    exit /b 1
+)
+
+REM Verify the Python module imports
+REM NOTE(review): the exit code of this import check is not tested.
+echo.
+echo [3/4] 验证安装...
+python -c "import nougat; print('✅ Nougat导入成功')"
+
+REM Probe the command-line entry point; a failure here only prints a warning
+echo.
+echo [4/4] 测试Nougat命令...
+nougat --version
+
+if errorlevel 1 (
+    echo ⚠️ 命令行工具未找到，但Python模块已安装
+    echo 这可能不影响使用，系统会尝试直接调用Python模块
+) else (
+    echo ✅ Nougat命令行工具正常
+)
+
+echo.
+echo ================================
+echo 🎉 Nougat安装完成！
+echo ================================
+echo.
+echo 说明：
+echo - Nougat擅长处理英文学术PDF
+echo - 能保留表格、公式等结构
+echo - 中文PDF会自动使用PyMuPDF
+echo - 首次使用会下载模型（约350MB）
+echo.
+echo 下一步：
+echo - 启动服务: start.bat
+echo - 健康检查: curl http://localhost:8000/api/health
+echo.
+
+pause
+
+
+
+
+
+
--- a/extraction_service/main.py
+++ b/extraction_service/main.py
@@ -0,0 +1,508 @@
+"""
+文档提取微服务 - 主入口
+
+功能：
+- PDF文本提取（PyMuPDF）
+- Docx文本提取（Mammoth）
+- Txt文本提取（直接读取）
+- 语言检测
+- 健康检查
+"""
+
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from loguru import logger
+from pathlib import Path
+import os
+import sys
+from datetime import datetime
+from dotenv import load_dotenv
+
+# Load environment variables (python-dotenv) before any os.getenv() call below
+load_dotenv()
+
+# Configure logging: drop the existing loguru handlers and log to stdout,
+# with the level taken from LOG_LEVEL (default "INFO")
+logger.remove()
+logger.add(
+    sys.stdout,
+    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>",
+    level=os.getenv("LOG_LEVEL", "INFO")
+)
+
+# Create the FastAPI application
+app = FastAPI(
+    title="文档提取微服务",
+    description="提供PDF、Docx、Txt文档的文本提取服务",
+    version="1.0.0",
+)
+
+# CORS configuration
+# NOTE(review): wildcard origins together with allow_credentials=True is very
+# permissive; tighten allow_origins before deploying to production.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # should be restricted to specific domains in production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Temporary file directory (TEMP_DIR env var, default /tmp/extraction_service);
+# created at import time if it does not exist
+TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
+TEMP_DIR.mkdir(parents=True, exist_ok=True)
+
+# 导入服务模块
+from services.pdf_extractor import extract_pdf_pymupdf
+from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
+from services.language_detector import detect_language, detect_language_detailed
+from services.nougat_extractor import check_nougat_available, get_nougat_info
+from services.file_utils import detect_file_type, cleanup_temp_file
+from services.docx_extractor import extract_docx_mammoth, validate_docx_file
+from services.txt_extractor import extract_txt, validate_txt_file
+
+
+# ==================== API路由 ====================
+
+@app.get("/")
+async def root():
+    """Return the service name, version and running status for the root path."""
+    service_info = dict(
+        service="文档提取微服务",
+        version="1.0.0",
+        status="running",
+    )
+    return service_info
+
+
+@app.get("/api/health")
+async def health_check():
+    """
+    Health-check endpoint.
+
+    Reports:
+    - whether PyMuPDF (``fitz``) can be imported, and its version
+    - whatever ``get_nougat_info()`` returns for Nougat
+    - whether TEMP_DIR exists and is writable
+
+    The overall status is "healthy" only when PyMuPDF is importable and the
+    temp directory is writable; otherwise it is "degraded". Nougat does not
+    influence the overall status.
+    """
+    # Start pessimistic; only flip to available once the import and the
+    # version lookup have both succeeded.
+    pymupdf_status = {"available": False, "version": "unknown"}
+    try:
+        import fitz  # PyMuPDF
+        pymupdf_status = {"available": True, "version": fitz.__version__}
+    except Exception as exc:
+        logger.warning(f"PyMuPDF不可用: {str(exc)}")
+
+    nougat_status = get_nougat_info()
+
+    writable = TEMP_DIR.exists() and os.access(TEMP_DIR, os.W_OK)
+
+    if pymupdf_status["available"] and writable:
+        overall = "healthy"
+    else:
+        overall = "degraded"
+
+    checks = {
+        "pymupdf": pymupdf_status,
+        "nougat": nougat_status,
+        "temp_dir": {
+            "path": str(TEMP_DIR),
+            "writable": writable,
+        },
+    }
+    return {
+        "status": overall,
+        "checks": checks,
+        "timestamp": datetime.now().isoformat(),
+    }
+
+
+@app.post("/api/extract/pdf")
+async def extract_pdf_endpoint(
+    file: UploadFile = File(...),
+    method: str = "auto"
+):
+    """
+    Extract text from an uploaded PDF.
+
+    The upload is written to a uniquely named file under TEMP_DIR, passed to
+    ``extract_pdf`` and removed again in ``finally``.
+
+    Args:
+        file: Uploaded PDF file; its name must end with ``.pdf``.
+        method: Extraction method ('auto' | 'nougat' | 'pymupdf').
+            - auto: ``extract_pdf`` is called with ``force_method=None`` (default)
+            - any other value is forwarded unchanged as ``force_method``
+
+    Returns:
+        JSONResponse wrapping the dict returned by ``extract_pdf``, with
+        ``metadata.file_size`` and ``metadata.filename`` added.
+
+    Raises:
+        HTTPException: 400 when the file name is missing or is not ``.pdf``;
+            500 when extraction reports failure or any other error occurs.
+    """
+    import uuid  # local import: only used to build a collision-free temp name
+
+    temp_path = None
+
+    try:
+        # A missing filename is treated as an invalid upload (400), not a crash
+        upload_name = file.filename or ""
+        if not upload_name.lower().endswith('.pdf'):
+            raise HTTPException(
+                status_code=400,
+                detail="文件格式错误，只支持PDF文件"
+            )
+
+        # Never trust the client-supplied name for the on-disk path: keep only
+        # its final component and add a random token so that concurrent
+        # uploads with the same name cannot overwrite or delete each other.
+        safe_name = os.path.basename(upload_name.replace("\\", "/"))
+        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
+
+        logger.info(f"开始处理PDF文件: {upload_name}, 方法={method}")
+
+        content = await file.read()
+        with open(temp_path, "wb") as f:
+            f.write(content)
+
+        file_size = len(content)
+        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
+
+        # "auto" means: let extract_pdf pick the method itself
+        force_method = None if method == "auto" else method
+        result = extract_pdf(str(temp_path), force_method=force_method)
+
+        if not result["success"]:
+            raise HTTPException(
+                status_code=500,
+                detail=f"PDF提取失败: {result.get('error', 'Unknown error')}"
+            )
+
+        # Report the original upload name, not the sanitized temp name
+        result["metadata"]["file_size"] = file_size
+        result["metadata"]["filename"] = upload_name
+
+        logger.info(f"PDF提取成功: {upload_name}, "
+                   f"方法={result['method']}, "
+                   f"原因={result.get('reason', 'N/A')}")
+
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"PDF提取失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"处理失败: {str(e)}"
+        )
+
+    finally:
+        # Always remove the temp file, whether extraction succeeded or not
+        if temp_path:
+            cleanup_temp_file(temp_path)
+
+
+@app.post("/api/detect-language")
+async def detect_language_endpoint(file: UploadFile = File(...)):
+    """
+    Detect the language of an uploaded PDF.
+
+    The upload is written to a uniquely named file under TEMP_DIR, passed to
+    ``detect_language_detailed`` and removed again in ``finally``.
+
+    Args:
+        file: Uploaded PDF file; its name must end with ``.pdf``.
+
+    Returns:
+        JSONResponse wrapping the dict returned by ``detect_language_detailed``
+        with a ``filename`` key added. Example shape from the original docs:
+        {
+            "language": "chinese" | "english" | "mixed",
+            "chinese_ratio": 0.65,
+            "chinese_chars": 3500,
+            "total_chars": 5000
+        }
+
+    Raises:
+        HTTPException: 400 when the file name is missing or is not ``.pdf``;
+            500 on any other error.
+    """
+    import uuid  # local import: only used to build a collision-free temp name
+
+    temp_path = None
+
+    try:
+        # A missing filename is treated as an invalid upload (400), not a crash
+        upload_name = file.filename or ""
+        if not upload_name.lower().endswith('.pdf'):
+            raise HTTPException(status_code=400, detail="只支持PDF文件")
+
+        # Keep only the final path component of the client-supplied name and
+        # add a random token so concurrent uploads cannot collide.
+        safe_name = os.path.basename(upload_name.replace("\\", "/"))
+        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
+
+        content = await file.read()
+        with open(temp_path, "wb") as f:
+            f.write(content)
+
+        result = detect_language_detailed(str(temp_path))
+        result["filename"] = upload_name
+
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"语言检测失败: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"检测失败: {str(e)}")
+
+    finally:
+        if temp_path:
+            cleanup_temp_file(temp_path)
+
+
+@app.post("/api/pdf-strategy")
+async def get_strategy_endpoint(file: UploadFile = File(...)):
+    """
+    Report the processing strategy for an uploaded PDF without extracting it.
+
+    The upload is written to a uniquely named file under TEMP_DIR, passed to
+    ``get_pdf_processing_strategy`` and removed again in ``finally``.
+
+    Args:
+        file: Uploaded PDF file; its name must end with ``.pdf``.
+
+    Returns:
+        JSONResponse wrapping the dict returned by
+        ``get_pdf_processing_strategy`` with a ``filename`` key added.
+        Example shape from the original docs:
+        {
+            "detected_language": "chinese" | "english",
+            "recommended_method": "nougat" | "pymupdf",
+            "reason": "...",
+            "nougat_available": true
+        }
+
+    Raises:
+        HTTPException: 400 when the file name is missing or is not ``.pdf``;
+            500 on any other error.
+    """
+    import uuid  # local import: only used to build a collision-free temp name
+
+    temp_path = None
+
+    try:
+        # A missing filename is treated as an invalid upload (400), not a crash
+        upload_name = file.filename or ""
+        if not upload_name.lower().endswith('.pdf'):
+            raise HTTPException(status_code=400, detail="只支持PDF文件")
+
+        # Keep only the final path component of the client-supplied name and
+        # add a random token so concurrent uploads cannot collide.
+        safe_name = os.path.basename(upload_name.replace("\\", "/"))
+        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
+
+        content = await file.read()
+        with open(temp_path, "wb") as f:
+            f.write(content)
+
+        result = get_pdf_processing_strategy(str(temp_path))
+        result["filename"] = upload_name
+
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"获取策略失败: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"失败: {str(e)}")
+
+    finally:
+        if temp_path:
+            cleanup_temp_file(temp_path)
+
+
+@app.post("/api/extract/docx")
+async def extract_docx_endpoint(file: UploadFile = File(...)):
+    """
+    Extract text from an uploaded Docx document.
+
+    The upload is written to a uniquely named ``.docx`` file under TEMP_DIR,
+    passed to ``extract_docx_mammoth`` and removed again in ``finally``.
+
+    Args:
+        file: Uploaded Docx file; its name must end with ``.docx``.
+
+    Returns:
+        JSONResponse wrapping the extractor result with ``method`` set to
+        "mammoth" and ``metadata.filename`` added:
+        {
+            "success": true,
+            "method": "mammoth",
+            "text": "...",
+            "metadata": {"char_count": ..., "has_tables": ..., "file_size": ...}
+        }
+
+    Raises:
+        HTTPException: 400 when the file name is missing or is not ``.docx``;
+            500 when extraction reports failure or any other error occurs.
+    """
+    import uuid  # local import: only used to build a collision-free temp name
+
+    temp_path = None
+
+    try:
+        # A missing filename is treated as an invalid upload (400), not a crash
+        upload_name = file.filename or ""
+        if not upload_name.lower().endswith('.docx'):
+            raise HTTPException(
+                status_code=400,
+                detail="文件格式错误，只支持Docx文件"
+            )
+
+        # Keep only the final path component of the client-supplied name and
+        # add a random token so concurrent uploads cannot collide. The name
+        # still ends with .docx, which the extractor checks.
+        safe_name = os.path.basename(upload_name.replace("\\", "/"))
+        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
+
+        logger.info(f"开始处理Docx文件: {upload_name}")
+
+        content = await file.read()
+        with open(temp_path, "wb") as f:
+            f.write(content)
+
+        file_size = len(content)
+        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
+
+        result = extract_docx_mammoth(str(temp_path))
+
+        if not result["success"]:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Docx提取失败: {result.get('error', 'Unknown error')}"
+            )
+
+        # Tag the method and report the original upload name
+        result["method"] = "mammoth"
+        result["metadata"]["filename"] = upload_name
+
+        logger.info(f"Docx提取成功: {upload_name}, "
+                   f"字符数={result['metadata']['char_count']}")
+
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Docx提取失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"处理失败: {str(e)}"
+        )
+
+    finally:
+        if temp_path:
+            cleanup_temp_file(temp_path)
+
+
+@app.post("/api/extract/txt")
+async def extract_txt_endpoint(file: UploadFile = File(...)):
+    """
+    Extract the content of an uploaded Txt file.
+
+    The upload is written to a uniquely named file under TEMP_DIR, passed to
+    ``extract_txt`` and removed again in ``finally``.
+
+    Args:
+        file: Uploaded Txt file; its name must end with ``.txt``.
+
+    Returns:
+        JSONResponse wrapping the extractor result with ``method`` set to
+        "direct" and ``metadata.filename`` added:
+        {
+            "success": true,
+            "method": "direct",
+            "text": "...",
+            "encoding": "utf-8",
+            "metadata": {"char_count": ..., "line_count": ..., "file_size": ...}
+        }
+
+    Raises:
+        HTTPException: 400 when the file name is missing or is not ``.txt``;
+            500 when extraction reports failure or any other error occurs.
+    """
+    import uuid  # local import: only used to build a collision-free temp name
+
+    temp_path = None
+
+    try:
+        # A missing filename is treated as an invalid upload (400), not a crash
+        upload_name = file.filename or ""
+        if not upload_name.lower().endswith('.txt'):
+            raise HTTPException(
+                status_code=400,
+                detail="文件格式错误，只支持Txt文件"
+            )
+
+        # Keep only the final path component of the client-supplied name and
+        # add a random token so concurrent uploads cannot collide.
+        safe_name = os.path.basename(upload_name.replace("\\", "/"))
+        temp_path = TEMP_DIR / f"temp_{os.getpid()}_{uuid.uuid4().hex}_{safe_name}"
+
+        logger.info(f"开始处理Txt文件: {upload_name}")
+
+        content = await file.read()
+        with open(temp_path, "wb") as f:
+            f.write(content)
+
+        file_size = len(content)
+        logger.info(f"文件大小: {file_size / 1024:.2f} KB")
+
+        result = extract_txt(str(temp_path))
+
+        if not result["success"]:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Txt提取失败: {result.get('error', 'Unknown error')}"
+            )
+
+        # Tag the method and report the original upload name
+        result["method"] = "direct"
+        result["metadata"]["filename"] = upload_name
+
+        logger.info(f"Txt提取成功: {upload_name}, "
+                   f"编码={result['encoding']}, "
+                   f"字符数={result['metadata']['char_count']}")
+
+        return JSONResponse(content=result)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Txt提取失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"处理失败: {str(e)}"
+        )
+
+    finally:
+        if temp_path:
+            cleanup_temp_file(temp_path)
+
+
+@app.post("/api/extract")
+async def extract_document(
+    file: UploadFile = File(...),
+    file_type: str = None
+):
+    """
+    Generic extraction endpoint: dispatch to the PDF, Docx or Txt handler.
+
+    Args:
+        file: Uploaded file.
+        file_type: Optional explicit type ('pdf' | 'docx' | 'txt',
+            case-insensitive). When omitted, the type is derived from the
+            file name via ``detect_file_type``.
+
+    Returns:
+        Whatever the selected type-specific endpoint returns.
+
+    Raises:
+        HTTPException: 400 when the type is unsupported or cannot be derived
+            from the file name; 500 on any other error. HTTPExceptions raised
+            by the type-specific endpoints are passed through unchanged.
+    """
+    try:
+        if file_type:
+            # Accept "PDF", " docx " etc. for an explicitly supplied type
+            file_type = file_type.strip().lower()
+        else:
+            try:
+                file_type = detect_file_type(file.filename or "")
+            except ValueError as exc:
+                # An unsupported extension is a client error, not a server fault
+                raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+        logger.info(f"文件类型: {file_type}, 文件名: {file.filename}")
+
+        # Each handler re-validates the file name extension itself
+        if file_type == 'pdf':
+            return await extract_pdf_endpoint(file)
+        elif file_type == 'docx':
+            return await extract_docx_endpoint(file)
+        elif file_type == 'txt':
+            return await extract_txt_endpoint(file)
+        else:
+            raise HTTPException(
+                status_code=400,
+                detail=f"不支持的文件格式: {file_type}，仅支持PDF、Docx、Txt"
+            )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"文档提取失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"处理失败: {str(e)}"
+        )
+
+
+# ==================== Startup configuration ====================
+
+if __name__ == "__main__":
+    # Run the app under uvicorn when this file is executed directly.
+    import uvicorn
+
+    # Settings come from the environment: SERVICE_PORT, SERVICE_HOST, DEBUG.
+    port = int(os.getenv("SERVICE_PORT", 8000))
+    host = os.getenv("SERVICE_HOST", "0.0.0.0")
+    # DEBUG defaults to "True", so auto-reload is enabled unless DEBUG is set
+    # to something other than "true" (case-insensitive).
+    debug = os.getenv("DEBUG", "True").lower() == "true"
+
+    logger.info(f"启动文档提取微服务...")
+    logger.info(f"地址: http://{host}:{port}")
+    logger.info(f"健康检查: http://{host}:{port}/api/health")
+    logger.info(f"调试模式: {debug}")
+
+    # The app is passed as an import string ("main:app") rather than as an object.
+    uvicorn.run(
+        "main:app",
+        host=host,
+        port=port,
+        reload=debug,
+        log_level="info"
+    )
+
--- a/extraction_service/requirements.txt
+++ b/extraction_service/requirements.txt
@@ -0,0 +1,31 @@
+# FastAPI核心依赖
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+python-multipart==0.0.6
+
+# PDF处理
+PyMuPDF>=1.24.0  # 使用更新版本，有预编译wheel
+pdfplumber==0.10.3
+nougat-ocr==0.1.17  # 学术PDF高质量提取（英文）
+albumentations==1.3.1  # Nougat兼容版本（不要升级到2.x）
+
+# Docx处理（Day 3需要）
+mammoth==1.6.0
+python-docx==1.1.0
+
+# 语言检测（Day 2需要）
+langdetect==1.0.9
+
+# 编码检测（Day 3需要）
+chardet==5.2.0
+
+# 工具
+python-dotenv==1.0.0
+pydantic>=2.10.0  # 使用更新版本，有预编译wheel
+
+# 日志
+loguru==0.7.2
+
+# 测试工具
+requests==2.31.0
+
--- a/extraction_service/services/__init__.py
+++ b/extraction_service/services/__init__.py
@@ -0,0 +1,11 @@
+"""
+服务模块
+
+包含各种文档提取和处理服务
+"""
+
+
+
+
+
+
--- a/extraction_service/services/docx_extractor.py
+++ b/extraction_service/services/docx_extractor.py
@@ -0,0 +1,257 @@
+"""
+Docx文档提取服务
+
+使用Mammoth库提取Word文档文本
+支持.docx格式（不支持老版.doc）
+"""
+
+import mammoth
+from pathlib import Path
+from typing import Dict, Any
+from loguru import logger
+
+
+def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
+    """
+    Extract the raw text of a .docx file with Mammoth.
+
+    Args:
+        file_path: Path to the Docx file.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "text": extracted text,
+            "format": "plain_text",
+            "metadata": {
+                "char_count": number of characters,
+                "has_tables": True when the text contains a tab or "|",
+                "file_size": size on disk in bytes,
+                "warnings": number of Mammoth messages
+            }
+        }
+        On failure (missing file, wrong suffix, empty text, or any exception):
+        {"success": False, "error": reason, "text": "", "metadata": {...}}.
+        This function does not raise; errors are reported in the dict.
+    """
+    def _failed(reason: str, meta: Dict[str, Any]) -> Dict[str, Any]:
+        # Uniform shape for every failure result
+        return {
+            "success": False,
+            "error": reason,
+            "text": "",
+            "metadata": meta
+        }
+
+    try:
+        docx_path = Path(file_path)
+
+        # The file must exist
+        if not docx_path.exists():
+            return _failed(f"文件不存在: {file_path}", {})
+
+        # Only the .docx suffix is accepted (case-insensitive)
+        suffix = docx_path.suffix
+        if suffix.lower() != '.docx':
+            return _failed(f"不支持的文件格式: {suffix}，仅支持.docx", {})
+
+        logger.info(f"开始提取Docx文件: {docx_path.name}")
+
+        # Plain-text extraction via Mammoth
+        with open(file_path, "rb") as stream:
+            extraction = mammoth.extract_raw_text(stream)
+            text = extraction.value
+            notes = extraction.messages  # warnings / errors reported by Mammoth
+
+        if notes:
+            logger.warning(f"Mammoth提取警告: {len(notes)}个")
+            for note in notes:
+                logger.debug(f"  - {note.type}: {note.message}")
+
+        # Minimal quality gate: an empty result counts as a failure
+        char_count = len(text)
+        if not char_count:
+            logger.warning("提取的文本为空")
+            return _failed(
+                "文档内容为空或无法提取",
+                {
+                    "char_count": 0,
+                    "file_size": docx_path.stat().st_size
+                },
+            )
+
+        # Rough table heuristic: a tab or a pipe anywhere in the text
+        has_tables = any(marker in text for marker in ('\t', '|'))
+
+        logger.info(f"Docx提取成功: {char_count}个字符")
+
+        metadata = {
+            "char_count": char_count,
+            "has_tables": has_tables,
+            "file_size": docx_path.stat().st_size,
+            "warnings": len(notes)
+        }
+        return {
+            "success": True,
+            "text": text,
+            "format": "plain_text",
+            "metadata": metadata
+        }
+
+    except Exception as exc:
+        logger.error(f"Docx提取失败: {str(exc)}")
+        return _failed(str(exc), {})
+
+
+def extract_docx_html(file_path: str) -> Dict[str, Any]:
+    """
+    Convert a Docx file to HTML with Mammoth (keeps more formatting than raw text).
+
+    Args:
+        file_path: Path to the Docx file.
+
+    Returns:
+        On success:
+        {
+            "success": True,
+            "html": converted HTML,
+            "format": "html",
+            "metadata": {"html_length": ..., "file_size": ..., "warnings": ...}
+        }
+        On failure (missing file or any exception):
+        {"success": False, "error": reason, "html": "", "metadata": {}}.
+        This function does not raise; errors are reported in the dict.
+    """
+    def _failed(reason: str) -> Dict[str, Any]:
+        # Uniform shape for every failure result
+        return {
+            "success": False,
+            "error": reason,
+            "html": "",
+            "metadata": {}
+        }
+
+    try:
+        docx_path = Path(file_path)
+
+        if not docx_path.exists():
+            return _failed(f"文件不存在: {file_path}")
+
+        logger.info(f"开始提取Docx为HTML: {docx_path.name}")
+
+        # HTML conversion via Mammoth
+        with open(file_path, "rb") as stream:
+            conversion = mammoth.convert_to_html(stream)
+            html = conversion.value
+            notes = conversion.messages
+
+        if notes:
+            logger.warning(f"HTML转换警告: {len(notes)}个")
+
+        html_length = len(html)
+        logger.info(f"HTML提取成功: {html_length}个字符")
+
+        metadata = {
+            "html_length": len(html),
+            "file_size": docx_path.stat().st_size,
+            "warnings": len(notes)
+        }
+        return {
+            "success": True,
+            "html": html,
+            "format": "html",
+            "metadata": metadata
+        }
+
+    except Exception as exc:
+        logger.error(f"HTML提取失败: {str(exc)}")
+        return _failed(str(exc))
+
+
+def validate_docx_file(file_path: str) -> Dict[str, Any]:
+    """
+    Check whether a path points to a plausible Docx file.
+
+    Checks, in order: the file exists, the suffix is ``.docx``
+    (case-insensitive), the size is at most 50MB, the file is not empty, and
+    the first four bytes are the ZIP local-file signature.
+
+    Args:
+        file_path: Path of the file to validate.
+
+    Returns:
+        {"valid": False, "reason": ...} when a check fails, otherwise
+        {
+            "valid": True,
+            "reason": ...,
+            "file_info": {"filename": ..., "size": bytes, "size_mb": ...}
+        }
+        This function does not raise; unexpected errors are reported as invalid.
+    """
+    size_limit = 50 * 1024 * 1024  # 50MB
+    zip_magic = b'PK\x03\x04'  # a docx is a ZIP archive
+
+    def _rejected(why: str) -> Dict[str, Any]:
+        return {
+            "valid": False,
+            "reason": why
+        }
+
+    try:
+        candidate = Path(file_path)
+
+        if not candidate.exists():
+            return _rejected("文件不存在")
+
+        extension = candidate.suffix
+        if extension.lower() != '.docx':
+            return _rejected(f"不支持的格式: {extension}（仅支持.docx）")
+
+        size_bytes = candidate.stat().st_size
+        if size_bytes > size_limit:
+            return _rejected(f"文件过大: {size_bytes / 1024 / 1024:.2f}MB（限制50MB）")
+        if size_bytes == 0:
+            return _rejected("文件为空")
+
+        # Basic validity check: read the header and compare it to the ZIP signature
+        try:
+            with open(file_path, "rb") as handle:
+                header = handle.read(4)
+        except Exception as exc:
+            return _rejected(f"无法读取文件: {str(exc)}")
+
+        if header != zip_magic:
+            return _rejected("不是有效的Docx文件（ZIP签名错误）")
+
+        file_info = {
+            "filename": candidate.name,
+            "size": size_bytes,
+            "size_mb": round(size_bytes / 1024 / 1024, 2)
+        }
+        return {
+            "valid": True,
+            "reason": "文件有效",
+            "file_info": file_info
+        }
+
+    except Exception as exc:
+        return _rejected(f"验证失败: {str(exc)}")
+
+
+
+
+
+
--- a/extraction_service/services/file_utils.py
+++ b/extraction_service/services/file_utils.py
@@ -0,0 +1,88 @@
+"""
+文件工具函数
+"""
+
+import os
+from pathlib import Path
+from loguru import logger
+
+
+def detect_file_type(filename: str) -> str:
+    """
+    Derive the document type from a file name's extension.
+
+    The extension is the text after the last dot, compared case-insensitively.
+    A name without any dot has no extension and is rejected, so a file that
+    is literally called "pdf" is no longer mistaken for a PDF.
+
+    Args:
+        filename: File name (may include directories).
+
+    Returns:
+        File type: 'pdf' | 'docx' | 'txt'
+
+    Raises:
+        ValueError: The extension is missing or not one of the supported ones.
+    """
+    # rpartition leaves `dot` empty when the name contains no "." at all
+    _, dot, ext = filename.lower().rpartition('.')
+    if not dot:
+        raise ValueError(f"不支持的文件格式: {filename}（缺少扩展名）")
+
+    if ext in ('pdf', 'docx', 'txt'):
+        return ext
+
+    raise ValueError(f"不支持的文件格式: .{ext}")
+
+
+def cleanup_temp_file(file_path: "Path | str") -> None:
+    """
+    Delete a temporary file if it exists (best effort).
+
+    Failures are logged as warnings and never raised, so this is safe to call
+    from a ``finally`` block.
+
+    The annotation is a string so that it is not evaluated at definition
+    time: ``Path | str`` as a live expression raises TypeError on Python
+    3.8/3.9, which the project documents as supported.
+
+    Args:
+        file_path: Path of the file to remove, as ``Path`` or ``str``.
+    """
+    try:
+        target = Path(file_path) if isinstance(file_path, str) else file_path
+
+        if target.exists():
+            target.unlink()
+            logger.debug(f"清理临时文件: {target}")
+    except Exception as e:
+        # Deliberately swallowed: cleanup must not mask the caller's own error
+        logger.warning(f"清理临时文件失败: {str(e)}")
+
+
+def get_file_size_mb(file_path: "Path | str") -> float:
+    """
+    Return the size of a file in megabytes (1 MB = 1024 * 1024 bytes).
+
+    The annotation is a string so that it is not evaluated at definition
+    time: ``Path | str`` as a live expression raises TypeError on Python
+    3.8/3.9, which the project documents as supported.
+
+    Args:
+        file_path: Path of the file, as ``Path`` or ``str``.
+
+    Returns:
+        File size in MB, or 0.0 when the path does not exist.
+    """
+    target = Path(file_path) if isinstance(file_path, str) else file_path
+
+    if not target.exists():
+        return 0.0
+    return target.stat().st_size / (1024 * 1024)
+
+
+def validate_file_size(file_size: int, max_size: int = 52428800) -> bool:
+    """
+    Check that a file size does not exceed the allowed maximum.
+
+    Args:
+        file_size: File size in bytes.
+        max_size: Largest accepted size in bytes; defaults to 52428800 (50MB).
+
+    Returns:
+        True when ``file_size`` is at most ``max_size``, otherwise False.
+    """
+    within_limit = max_size >= file_size
+    return within_limit
+
+
+
+
+
+
--- a/extraction_service/services/language_detector.py
+++ b/extraction_service/services/language_detector.py
@@ -0,0 +1,160 @@
+"""
+语言检测服务
+
+检测PDF文档的主要语言（中文/英文/混合）
+用于决定使用哪种提取方法
+"""
+
+import pdfplumber
+from typing import Dict, Any
+from loguru import logger
+
+
+def detect_language(pdf_path: str) -> str:
+    """
+    检测PDF主要语言
+    
+    策略：
+    1. 提取前3页文本（代表性强）
+    2. 统计中文字符比例
+    3. 判断语言类型
+    
+    Args:
+        pdf_path: PDF文件路径
+    
+    Returns:
+        'chinese' | 'english' | 'mixed'
+    """
+    try:
+        logger.info(f"开始语言检测: {pdf_path}")
+        
+        with pdfplumber.open(pdf_path) as pdf:
+            # 提取前3页文本（或全部如果少于3页）
+            sample_pages = min(3, len(pdf.pages))
+            sample_text = ""
+            
+            for i in range(sample_pages):
+                try:
+                    page_text = pdf.pages[i].extract_text()
+                    if page_text:
+                        sample_text += page_text + "\n"
+                except Exception as e:
+                    logger.warning(f"第{i+1}页文本提取失败: {str(e)}")
+                    continue
+        
+        # 检查是否有足够文本
+        if len(sample_text.strip()) < 100:
+            logger.warning("文本太少，默认使用英文处理")
+            return 'english'
+        
+        # 统计中文字符比例
+        chinese_chars = len([c for c in sample_text if '\u4e00' <= c <= '\u9fff'])
+        total_chars = len([c for c in sample_text if c.strip()])
+        
+        if total_chars == 0:
+            logger.warning("无有效字符，默认使用英文处理")
+            return 'english'
+        
+        chinese_ratio = chinese_chars / total_chars
+        
+        logger.info(f"中文字符比例: {chinese_ratio:.2%} ({chinese_chars}/{total_chars})")
+        
+        # 判断语言
+        # 阈值说明：
+        # - > 30%: 判定为中文PDF（包括中英混合但中文为主）
+        # - <= 30%: 判定为英文PDF
+        if chinese_ratio > 0.3:
+            language = 'chinese'
+        else:
+            language = 'english'
+        
+        logger.info(f"检测结果: {language}")
+        return language
+    
+    except Exception as e:
+        logger.error(f"语言检测失败: {str(e)}，默认使用英文处理")
+        return 'english'
+
+
+def detect_language_detailed(pdf_path: str) -> Dict[str, Any]:
+    """
+    详细的语言检测
+    
+    返回更多统计信息
+    
+    Args:
+        pdf_path: PDF文件路径
+    
+    Returns:
+        {
+            "language": "chinese" | "english" | "mixed",
+            "chinese_ratio": 0.65,
+            "chinese_chars": 3500,
+            "total_chars": 5000,
+            "sample_pages": 3,
+            "sample_text_length": 5000
+        }
+    """
+    try:
+        with pdfplumber.open(pdf_path) as pdf:
+            sample_pages = min(3, len(pdf.pages))
+            sample_text = ""
+            
+            for i in range(sample_pages):
+                try:
+                    page_text = pdf.pages[i].extract_text()
+                    if page_text:
+                        sample_text += page_text + "\n"
+                except:
+                    continue
+        
+        # 统计
+        chinese_chars = len([c for c in sample_text if '\u4e00' <= c <= '\u9fff'])
+        total_chars = len([c for c in sample_text if c.strip()])
+        
+        chinese_ratio = chinese_chars / total_chars if total_chars > 0 else 0
+        
+        # 判断语言
+        if chinese_ratio > 0.3:
+            language = 'chinese'
+        elif chinese_ratio > 0.1:
+            language = 'mixed'
+        else:
+            language = 'english'
+        
+        return {
+            "language": language,
+            "chinese_ratio": round(chinese_ratio, 4),
+            "chinese_chars": chinese_chars,
+            "total_chars": total_chars,
+            "sample_pages": sample_pages,
+            "sample_text_length": len(sample_text)
+        }
+    
+    except Exception as e:
+        logger.error(f"详细语言检测失败: {str(e)}")
+        return {
+            "language": "english",
+            "error": str(e)
+        }
+
+
+def is_chinese_pdf(pdf_path: str, threshold: float = 0.3) -> bool:
+    """
+    简单判断是否为中文PDF
+    
+    Args:
+        pdf_path: PDF文件路径
+        threshold: 中文字符比例阈值，默认30%
+    
+    Returns:
+        True if 中文字符比例 > threshold
+    """
+    language = detect_language(pdf_path)
+    return language == 'chinese'
+
+
+
+
+
+
--- a/extraction_service/services/nougat_extractor.py
+++ b/extraction_service/services/nougat_extractor.py
@@ -0,0 +1,241 @@
+"""
+Nougat提取服务
+
+使用Nougat OCR提取学术PDF的高质量文本
+保留表格、公式等结构信息
+"""
+
+import subprocess
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional, Callable
+from loguru import logger
+
+
+def check_nougat_available() -> bool:
+    """
+    检查Nougat是否已安装
+    
+    Returns:
+        True if Nougat可用
+    """
+    try:
+        # 方法1: 尝试导入nougat模块
+        import nougat
+        logger.info(f"Nougat module is available (version: {getattr(nougat, '__version__', 'unknown')})")
+        return True
+    except ImportError:
+        logger.warning("Nougat module not found")
+        return False
+    except Exception as e:
+        logger.error(f"检查Nougat失败: {str(e)}")
+        return False
+
+
+def extract_pdf_nougat(
+    file_path: str,
+    output_dir: Optional[str] = None,
+    progress_callback: Optional[Callable[[int, int], None]] = None
+) -> Dict[str, Any]:
+    """
+    使用Nougat提取PDF文本
+    
+    Args:
+        file_path: PDF文件路径
+        output_dir: 输出目录，默认为临时目录
+        progress_callback: 进度回调函数 (current_page, total_pages)
+    
+    Returns:
+        {
+            "success": True,
+            "method": "nougat",
+            "text": "提取的Markdown文本",
+            "format": "markdown",
+            "metadata": {
+                "page_count": 20,
+                "char_count": 50000,
+                "quality_score": 0.95,
+                "has_tables": True,
+                "has_formulas": True
+            }
+        }
+    """
+    try:
+        # 检查Nougat是否可用
+        if not check_nougat_available():
+            raise Exception("Nougat未安装，请先安装：pip install nougat-ocr")
+        
+        logger.info(f"开始使用Nougat提取: {file_path}")
+        
+        # 准备输出目录
+        if output_dir is None:
+            output_dir = os.path.join(os.path.dirname(file_path), "nougat_output")
+        
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+        
+        # 构建Nougat命令
+        # nougat命令格式：nougat <pdf_path> -o <output_dir>
+        cmd = [
+            'nougat',
+            file_path,
+            '-o', output_dir,
+            '--markdown',  # 输出Markdown格式
+            '--no-skipping'  # 不跳过任何页面
+        ]
+        
+        logger.info(f"执行命令: {' '.join(cmd)}")
+        
+        # 执行Nougat
+        # 注意：Nougat可能需要较长时间（1-2分钟/20页）
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True
+        )
+        
+        # 等待完成
+        stdout, stderr = process.communicate(timeout=300)  # 5分钟超时
+        
+        if process.returncode != 0:
+            logger.error(f"Nougat执行失败: {stderr}")
+            raise Exception(f"Nougat执行失败: {stderr}")
+        
+        # 读取输出文件
+        # Nougat会生成 <filename>.mmd 文件
+        pdf_name = Path(file_path).stem
+        output_file = Path(output_dir) / f"{pdf_name}.mmd"
+        
+        if not output_file.exists():
+            raise Exception(f"Nougat输出文件不存在: {output_file}")
+        
+        with open(output_file, 'r', encoding='utf-8') as f:
+            markdown_text = f.read()
+        
+        # 评估质量
+        quality_result = evaluate_nougat_quality(markdown_text)
+        
+        logger.info(f"Nougat提取完成: 质量={quality_result['quality_score']:.2f}")
+        
+        return {
+            "success": True,
+            "method": "nougat",
+            "text": markdown_text,
+            "format": "markdown",
+            "metadata": {
+                "char_count": len(markdown_text),
+                "quality_score": quality_result['quality_score'],
+                "has_tables": quality_result['has_tables'],
+                "has_formulas": quality_result['has_formulas'],
+                "has_structure": quality_result['has_structure']
+            }
+        }
+    
+    except subprocess.TimeoutExpired:
+        logger.error("Nougat处理超时（>5分钟）")
+        return {
+            "success": False,
+            "error": "处理超时",
+            "method": "nougat"
+        }
+    
+    except Exception as e:
+        logger.error(f"Nougat提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "nougat"
+        }
+
+
+def evaluate_nougat_quality(text: str) -> Dict[str, Any]:
+    """
+    评估Nougat提取质量
+    
+    评分标准：
+    - 基础分：0.5
+    - 有章节结构：+0.2
+    - 有表格：+0.15
+    - 有公式：+0.15
+    - 文本长度充足：+0.1
+    - 乱码检测：-0.3
+    
+    Args:
+        text: Nougat提取的Markdown文本
+    
+    Returns:
+        {
+            "quality_score": 0.92,
+            "has_structure": True,
+            "has_tables": True,
+            "has_formulas": True,
+            "has_garbled": False
+        }
+    """
+    score = 0.5  # 基础分
+    
+    # 检查章节结构（Markdown标题）
+    has_structure = bool(text.count('##') >= 2 or text.count('#') >= 3)
+    if has_structure:
+        score += 0.2
+    
+    # 检查表格
+    has_tables = '|' in text and '---' in text
+    if has_tables:
+        score += 0.15
+    
+    # 检查公式（LaTeX格式）
+    has_formulas = '$$' in text or '$' in text or '\\(' in text
+    if has_formulas:
+        score += 0.15
+    
+    # 检查文本长度
+    if len(text) > 5000:  # 至少5000字符
+        score += 0.1
+    
+    # 检查乱码（简单启发式）
+    # 大量重复字符或特殊符号可能表示乱码
+    garbled_chars = sum(1 for c in text if ord(c) > 65535 or c in '<EFBFBD><EFBFBD>')
+    has_garbled = garbled_chars > len(text) * 0.05  # 超过5%
+    if has_garbled:
+        score -= 0.3
+    
+    # 确保分数在0-1之间
+    score = max(0.0, min(1.0, score))
+    
+    return {
+        "quality_score": score,
+        "has_structure": has_structure,
+        "has_tables": has_tables,
+        "has_formulas": has_formulas,
+        "has_garbled": has_garbled
+    }
+
+
+def get_nougat_info() -> Dict[str, Any]:
+    """
+    获取Nougat信息
+    
+    Returns:
+        Nougat版本和状态信息
+    """
+    try:
+        import nougat
+        version = getattr(nougat, '__version__', 'unknown')
+        return {
+            "available": True,
+            "version": version
+        }
+    
+    except ImportError:
+        return {
+            "available": False,
+            "error": "Nougat未安装"
+        }
+    
+    except Exception as e:
+        return {
+            "available": False,
+            "error": str(e)
+        }
+
--- a/extraction_service/services/pdf_extractor.py
+++ b/extraction_service/services/pdf_extractor.py
@@ -0,0 +1,191 @@
+"""
+PDF文本提取服务
+
+使用PyMuPDF (fitz)提取PDF文本内容
+"""
+
+import fitz  # PyMuPDF
+from typing import Dict, Any
+from loguru import logger
+
+
+def extract_pdf_pymupdf(file_path: str) -> Dict[str, Any]:
+    """
+    使用PyMuPDF提取PDF文本
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        {
+            "success": True,
+            "method": "pymupdf",
+            "text": "提取的文本",
+            "metadata": {
+                "page_count": 20,
+                "char_count": 50000,
+                "has_text": True
+            }
+        }
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取: {file_path}")
+        
+        # 打开PDF
+        doc = fitz.open(file_path)
+        page_count = len(doc)
+        
+        logger.info(f"PDF页数: {page_count}")
+        
+        # 提取所有页面的文本
+        text_parts = []
+        
+        for page_num in range(page_count):
+            try:
+                page = doc[page_num]
+                text = page.get_text()
+                
+                if text.strip():
+                    # 添加页面分隔符
+                    text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                    text_parts.append(text)
+                    
+                    logger.debug(f"第 {page_num + 1} 页提取了 {len(text)} 个字符")
+            
+            except Exception as e:
+                logger.warning(f"第 {page_num + 1} 页提取失败: {str(e)}")
+                continue
+        
+        # 合并文本
+        full_text = "".join(text_parts)
+        char_count = len(full_text)
+        
+        # 关闭文档
+        doc.close()
+        
+        # 检查是否提取到文本
+        has_text = char_count > 100  # 至少要有100个字符
+        
+        if not has_text:
+            logger.warning(f"PDF可能是扫描版或无文本内容")
+        
+        logger.info(f"PyMuPDF提取完成: 字符数={char_count}")
+        
+        return {
+            "success": True,
+            "method": "pymupdf",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": char_count,
+                "has_text": has_text
+            }
+        }
+    
+    except Exception as e:
+        logger.error(f"PyMuPDF提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "pymupdf"
+        }
+
+
+def extract_pdf_with_layout(file_path: str) -> Dict[str, Any]:
+    """
+    使用PyMuPDF提取PDF文本（保留布局）
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        提取结果
+    """
+    try:
+        logger.info(f"开始使用PyMuPDF提取（保留布局）: {file_path}")
+        
+        doc = fitz.open(file_path)
+        page_count = len(doc)
+        
+        text_parts = []
+        
+        for page_num in range(page_count):
+            try:
+                page = doc[page_num]
+                
+                # 使用dict模式提取，可以保留更多格式信息
+                blocks = page.get_text("dict")["blocks"]
+                
+                page_text = []
+                
+                for block in blocks:
+                    if block["type"] == 0:  # 文本块
+                        for line in block.get("lines", []):
+                            for span in line.get("spans", []):
+                                text = span.get("text", "")
+                                if text.strip():
+                                    page_text.append(text)
+                
+                if page_text:
+                    text_parts.append(f"\n\n--- 第 {page_num + 1} 页 ---\n\n")
+                    text_parts.append(" ".join(page_text))
+            
+            except Exception as e:
+                logger.warning(f"第 {page_num + 1} 页处理失败: {str(e)}")
+                continue
+        
+        full_text = "".join(text_parts)
+        doc.close()
+        
+        return {
+            "success": True,
+            "method": "pymupdf_layout",
+            "text": full_text,
+            "format": "plain_text",
+            "metadata": {
+                "page_count": page_count,
+                "char_count": len(full_text)
+            }
+        }
+    
+    except Exception as e:
+        logger.error(f"PyMuPDF布局提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e)
+        }
+
+
+def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
+    """
+    获取PDF元数据
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        PDF元数据
+    """
+    try:
+        doc = fitz.open(file_path)
+        
+        metadata = {
+            "page_count": len(doc),
+            "metadata": doc.metadata,
+            "is_encrypted": doc.is_encrypted,
+            "is_pdf": doc.is_pdf
+        }
+        
+        doc.close()
+        return metadata
+    
+    except Exception as e:
+        logger.error(f"获取PDF元数据失败: {str(e)}")
+        return {}
+
+
+
+
+
+
--- a/extraction_service/services/pdf_processor.py
+++ b/extraction_service/services/pdf_processor.py
@@ -0,0 +1,192 @@
+"""
+PDF处理主服务
+
+实现顺序降级策略：
+1. 检测语言
+2. 中文PDF → PyMuPDF（快速）
+3. 英文PDF → Nougat → 失败降级PyMuPDF
+"""
+
+from typing import Dict, Any, Optional
+from loguru import logger
+
+from .language_detector import detect_language
+from .nougat_extractor import extract_pdf_nougat, check_nougat_available
+from .pdf_extractor import extract_pdf_pymupdf
+
+
+def extract_pdf(
+    file_path: str,
+    force_method: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    PDF提取主函数（顺序降级策略）
+    
+    处理流程：
+    1. 检测语言
+    2. 中文 → 直接PyMuPDF
+    3. 英文 → 尝试Nougat → 失败降级PyMuPDF
+    
+    Args:
+        file_path: PDF文件路径
+        force_method: 强制使用的方法 ('nougat' | 'pymupdf')
+    
+    Returns:
+        {
+            "success": True,
+            "method": "nougat" | "pymupdf",
+            "reason": "chinese_pdf" | "english_pdf" | "nougat_failed" | "nougat_low_quality",
+            "text": "提取的文本",
+            "metadata": {...}
+        }
+    """
+    try:
+        logger.info(f"开始处理PDF: {file_path}")
+        
+        # Step 1: 语言检测
+        logger.info("[Step 1] 检测PDF语言...")
+        language = detect_language(file_path)
+        logger.info(f"检测结果: {language}")
+        
+        # 如果强制指定方法
+        if force_method:
+            logger.info(f"强制使用方法: {force_method}")
+            
+            if force_method == 'nougat':
+                return extract_pdf_nougat(file_path)
+            elif force_method == 'pymupdf':
+                result = extract_pdf_pymupdf(file_path)
+                result['reason'] = 'force_pymupdf'
+                return result
+        
+        # Step 2: 中文PDF → 直接PyMuPDF
+        if language == 'chinese':
+            logger.info("[Step 2] 中文PDF，使用PyMuPDF快速处理")
+            
+            result = extract_pdf_pymupdf(file_path)
+            
+            if result['success']:
+                result['reason'] = 'chinese_pdf'
+                result['detected_language'] = language
+                logger.info("✅ PyMuPDF处理成功（中文PDF）")
+                return result
+            else:
+                logger.error("❌ PyMuPDF处理失败")
+                return result
+        
+        # Step 3: 英文PDF → 尝试Nougat
+        logger.info("[Step 3] 英文PDF，尝试Nougat高质量解析")
+        
+        # 检查Nougat是否可用
+        if not check_nougat_available():
+            logger.warning("⚠️ Nougat不可用，降级到PyMuPDF")
+            
+            result = extract_pdf_pymupdf(file_path)
+            if result['success']:
+                result['reason'] = 'nougat_unavailable'
+                result['detected_language'] = language
+            return result
+        
+        # 尝试Nougat
+        try:
+            nougat_result = extract_pdf_nougat(file_path)
+            
+            if not nougat_result['success']:
+                logger.warning("⚠️ Nougat提取失败，降级到PyMuPDF")
+                raise Exception(nougat_result.get('error', 'Nougat failed'))
+            
+            # 质量检查
+            quality_score = nougat_result['metadata'].get('quality_score', 0)
+            
+            logger.info(f"Nougat质量评分: {quality_score:.2f}")
+            
+            # 质量阈值：0.7
+            if quality_score >= 0.7:
+                logger.info("✅ Nougat处理成功（质量合格）")
+                nougat_result['reason'] = 'english_pdf_high_quality'
+                nougat_result['detected_language'] = language
+                return nougat_result
+            else:
+                logger.warning(f"⚠️ Nougat质量不足: {quality_score:.2f}，降级到PyMuPDF")
+                raise Exception(f"Quality too low: {quality_score}")
+        
+        except Exception as e:
+            logger.warning(f"Nougat处理失败: {str(e)}，降级到PyMuPDF")
+        
+        # Step 4: 降级到PyMuPDF
+        logger.info("[Step 4] 降级使用PyMuPDF")
+        
+        result = extract_pdf_pymupdf(file_path)
+        
+        if result['success']:
+            result['reason'] = 'nougat_failed_or_low_quality'
+            result['detected_language'] = language
+            result['fallback'] = True
+            logger.info("✅ PyMuPDF处理成功（降级方案）")
+        else:
+            logger.error("❌ PyMuPDF处理也失败了")
+        
+        return result
+    
+    except Exception as e:
+        logger.error(f"PDF处理完全失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "method": "unknown"
+        }
+
+
+def get_pdf_processing_strategy(file_path: str) -> Dict[str, Any]:
+    """
+    获取PDF处理策略（不实际提取）
+    
+    用于预览将使用哪种方法
+    
+    Args:
+        file_path: PDF文件路径
+    
+    Returns:
+        {
+            "detected_language": "chinese" | "english",
+            "recommended_method": "nougat" | "pymupdf",
+            "reason": "...",
+            "nougat_available": True | False
+        }
+    """
+    try:
+        # 检测语言
+        language = detect_language(file_path)
+        
+        # 检查Nougat可用性
+        nougat_available = check_nougat_available()
+        
+        # 决定策略
+        if language == 'chinese':
+            recommended_method = 'pymupdf'
+            reason = '中文PDF，推荐使用PyMuPDF快速处理'
+        elif nougat_available:
+            recommended_method = 'nougat'
+            reason = '英文PDF，推荐使用Nougat高质量解析'
+        else:
+            recommended_method = 'pymupdf'
+            reason = 'Nougat不可用，使用PyMuPDF'
+        
+        return {
+            "detected_language": language,
+            "recommended_method": recommended_method,
+            "reason": reason,
+            "nougat_available": nougat_available
+        }
+    
+    except Exception as e:
+        logger.error(f"获取处理策略失败: {str(e)}")
+        return {
+            "error": str(e)
+        }
+
+
+
+
+
+
--- a/extraction_service/services/txt_extractor.py
+++ b/extraction_service/services/txt_extractor.py
@@ -0,0 +1,320 @@
+"""
+Txt文本文件提取服务
+
+直接读取纯文本文件
+支持多种编码自动检测
+"""
+
+from pathlib import Path
+from typing import Dict, Any, List
+from loguru import logger
+import chardet
+
+
+def extract_txt(file_path: str) -> Dict[str, Any]:
+    """
+    提取Txt文件内容
+    
+    特性:
+    - 自动检测编码（UTF-8, GBK, GB2312等）
+    - 支持大文件（逐块读取）
+    - 去除BOM标记
+    
+    Args:
+        file_path: Txt文件路径
+    
+    Returns:
+        {
+            "success": True,
+            "text": "文本内容",
+            "encoding": "检测到的编码",
+            "metadata": {
+                "char_count": 字符数,
+                "line_count": 行数,
+                "file_size": 文件大小
+            }
+        }
+    """
+    try:
+        file_path_obj = Path(file_path)
+        
+        # 验证文件存在
+        if not file_path_obj.exists():
+            return {
+                "success": False,
+                "error": f"文件不存在: {file_path}",
+                "text": "",
+                "metadata": {}
+            }
+        
+        # 验证文件格式
+        if file_path_obj.suffix.lower() != '.txt':
+            return {
+                "success": False,
+                "error": f"不支持的文件格式: {file_path_obj.suffix}，仅支持.txt",
+                "text": "",
+                "metadata": {}
+            }
+        
+        file_size = file_path_obj.stat().st_size
+        
+        # 空文件检查
+        if file_size == 0:
+            return {
+                "success": False,
+                "error": "文件为空",
+                "text": "",
+                "metadata": {
+                    "char_count": 0,
+                    "line_count": 0,
+                    "file_size": 0
+                }
+            }
+        
+        logger.info(f"开始提取Txt文件: {file_path_obj.name} ({file_size / 1024:.2f} KB)")
+        
+        # 检测编码
+        detected_encoding = detect_encoding(file_path)
+        logger.info(f"检测到编码: {detected_encoding}")
+        
+        # 读取文件（带编码回退）
+        text, actual_encoding = read_with_fallback(file_path, detected_encoding)
+        
+        if text is None:
+            return {
+                "success": False,
+                "error": "无法解码文件，尝试了多种编码均失败",
+                "text": "",
+                "metadata": {}
+            }
+        
+        # 统计信息
+        char_count = len(text)
+        line_count = text.count('\n') + 1
+        
+        logger.info(f"Txt提取成功: {char_count}个字符, {line_count}行")
+        
+        return {
+            "success": True,
+            "text": text,
+            "encoding": actual_encoding,
+            "metadata": {
+                "char_count": char_count,
+                "line_count": line_count,
+                "file_size": file_size,
+                "size_kb": round(file_size / 1024, 2)
+            }
+        }
+    
+    except Exception as e:
+        logger.error(f"Txt提取失败: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "text": "",
+            "metadata": {}
+        }
+
+
+def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
+    """
+    检测文件编码
+    
+    Args:
+        file_path: 文件路径
+        sample_size: 采样大小（字节）
+    
+    Returns:
+        检测到的编码名称
+    """
+    try:
+        with open(file_path, 'rb') as f:
+            raw_data = f.read(sample_size)
+        
+        # 使用chardet检测
+        result = chardet.detect(raw_data)
+        encoding = result['encoding']
+        confidence = result['confidence']
+        
+        logger.debug(f"编码检测: {encoding} (置信度: {confidence:.2f})")
+        
+        # 如果置信度太低，使用UTF-8作为默认
+        if confidence < 0.7:
+            logger.warning(f"编码置信度较低({confidence:.2f})，将尝试UTF-8")
+            return 'utf-8'
+        
+        return encoding if encoding else 'utf-8'
+    
+    except Exception as e:
+        logger.warning(f"编码检测失败: {str(e)}，使用UTF-8")
+        return 'utf-8'
+
+
+def read_with_fallback(file_path: str, primary_encoding: str) -> tuple[str, str]:
+    """
+    尝试多种编码读取文件
+    
+    Args:
+        file_path: 文件路径
+        primary_encoding: 首选编码
+    
+    Returns:
+        (文本内容, 实际使用的编码)
+    """
+    # 编码尝试列表（按优先级）
+    encodings = [
+        primary_encoding,
+        'utf-8',
+        'utf-8-sig',  # UTF-8 with BOM
+        'gbk',
+        'gb2312',
+        'gb18030',
+        'latin-1',
+        'cp1252',
+        'iso-8859-1'
+    ]
+    
+    # 去重并保持顺序
+    seen = set()
+    unique_encodings = []
+    for enc in encodings:
+        if enc and enc.lower() not in seen:
+            seen.add(enc.lower())
+            unique_encodings.append(enc)
+    
+    # 尝试每种编码
+    for encoding in unique_encodings:
+        try:
+            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
+                text = f.read()
+            
+            logger.info(f"成功使用编码: {encoding}")
+            return text, encoding
+        
+        except UnicodeDecodeError:
+            logger.debug(f"编码 {encoding} 解码失败，尝试下一个")
+            continue
+        
+        except Exception as e:
+            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
+            continue
+    
+    # 所有编码都失败
+    logger.error("所有编码尝试均失败")
+    return None, None
+
+
+def validate_txt_file(file_path: str) -> Dict[str, Any]:
+    """
+    验证Txt文件的有效性
+    
+    Args:
+        file_path: 文件路径
+    
+    Returns:
+        {
+            "valid": True/False,
+            "reason": "原因",
+            "file_info": {文件信息}
+        }
+    """
+    try:
+        file_path_obj = Path(file_path)
+        
+        # 检查文件存在
+        if not file_path_obj.exists():
+            return {
+                "valid": False,
+                "reason": "文件不存在"
+            }
+        
+        # 检查后缀
+        if file_path_obj.suffix.lower() != '.txt':
+            return {
+                "valid": False,
+                "reason": f"不支持的格式: {file_path_obj.suffix}（仅支持.txt）"
+            }
+        
+        # 检查文件大小（限制10MB，txt文件通常较小）
+        file_size = file_path_obj.stat().st_size
+        max_size = 10 * 1024 * 1024  # 10MB
+        
+        if file_size > max_size:
+            return {
+                "valid": False,
+                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB（限制10MB）"
+            }
+        
+        if file_size == 0:
+            return {
+                "valid": False,
+                "reason": "文件为空"
+            }
+        
+        # 尝试检测编码
+        encoding = detect_encoding(str(file_path_obj))
+        
+        return {
+            "valid": True,
+            "reason": "文件有效",
+            "file_info": {
+                "filename": file_path_obj.name,
+                "size": file_size,
+                "size_kb": round(file_size / 1024, 2),
+                "detected_encoding": encoding
+            }
+        }
+    
+    except Exception as e:
+        return {
+            "valid": False,
+            "reason": f"验证失败: {str(e)}"
+        }
+
+
+def preview_txt(file_path: str, lines: int = 10) -> Dict[str, Any]:
+    """
+    预览Txt文件前几行
+    
+    Args:
+        file_path: 文件路径
+        lines: 预览行数
+    
+    Returns:
+        {
+            "success": True,
+            "preview": "前N行内容",
+            "total_lines": 总行数（如果能快速获取）
+        }
+    """
+    try:
+        result = extract_txt(file_path)
+        
+        if not result['success']:
+            return result
+        
+        text = result['text']
+        text_lines = text.split('\n')
+        
+        preview_lines = text_lines[:lines]
+        preview = '\n'.join(preview_lines)
+        
+        return {
+            "success": True,
+            "preview": preview,
+            "total_lines": len(text_lines),
+            "preview_lines": len(preview_lines)
+        }
+    
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e),
+            "preview": ""
+        }
+
+
+
+
+
+
--- a/extraction_service/start.bat
+++ b/extraction_service/start.bat
@@ -0,0 +1,37 @@
+@echo off
+REM Switch the console to the UTF-8 code page so the messages below render.
+chcp 65001 >nul
+REM Run from the script's own directory so the relative paths used below
+REM (venv\, requirements.txt, main:app) resolve wherever it is launched from.
+cd /d "%~dp0"
+echo ================================
+echo 启动文档提取微服务
+echo ================================
+echo.
+
+REM Activate the virtual environment if one exists next to this script.
+if exist venv\Scripts\activate.bat (
+    echo [1/3] 激活虚拟环境...
+    call venv\Scripts\activate
+) else (
+    echo 警告: 未找到虚拟环境，使用全局Python
+)
+
+REM Install the dependencies when fastapi is not listed by pip.
+echo [2/3] 检查依赖...
+pip list | findstr "fastapi" >nul
+if errorlevel 1 (
+    echo 依赖未安装，正在安装...
+    pip install -r requirements.txt
+)
+
+REM Start the service.
+echo [3/3] 启动服务...
+echo.
+echo 服务地址: http://localhost:8000
+echo 健康检查: http://localhost:8000/api/health
+echo API文档: http://localhost:8000/docs
+echo.
+echo 按 Ctrl+C 停止服务
+echo.
+
+uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+
+REM Keep the window open after the server exits so its output stays visible.
+pause
+
--- a/extraction_service/test_files/test.txt
+++ b/extraction_service/test_files/test.txt
@@ -0,0 +1,29 @@
+这是一个测试文本文件。
+用于测试Txt文件提取功能。
+
+AI临床研究平台 - Phase 2 Day 3测试
+
+功能特点：
+1. 自动编码检测
+2. 支持UTF-8、GBK等多种编码
+3. 统计字符数和行数
+4. 快速文本提取
+
+测试内容包含：
+- 中文字符
+- 英文字符 (English characters)
+- 数字 123456
+- 特殊符号 !@#$%^&*()
+
+多行文本测试：
+第一行
+第二行
+第三行
+
+结束。
+
+
+
+
+
+
--- a/extraction_service/test_service.py
+++ b/extraction_service/test_service.py
@@ -0,0 +1,171 @@
+"""
+服务测试脚本
+
+测试文档提取微服务的各项功能
+"""
+
+import requests
+import sys
+from pathlib import Path
+
+
+# Root URL of the running extraction service that the checks send requests to.
+BASE_URL = "http://localhost:8000"
+
+
+def test_health():
+    """Call the ``/api/health`` endpoint and print what it reports.
+
+    Returns:
+        True when the endpoint answers HTTP 200 and the expected fields can
+        be read from its JSON body; False for any other status code, or when
+        sending the request or reading the response raises.
+    """
+    print("\n" + "="*50)
+    print("测试1: 健康检查")
+    print("="*50)
+
+    try:
+        # requests waits forever by default; bound the wait so that a hung
+        # service fails this check instead of blocking the whole script.
+        response = requests.get(f"{BASE_URL}/api/health", timeout=10)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务状态: {data['status']}")
+            print(f"PyMuPDF: {data['checks']['pymupdf']['available']} (v{data['checks']['pymupdf']['version']})")
+            print(f"临时目录: {data['checks']['temp_dir']['path']}")
+            print("✅ 健康检查通过")
+            return True
+        else:
+            print("❌ 健康检查失败")
+            return False
+    except Exception as e:
+        # Covers connection errors and timeouts as well as a response body
+        # that is not JSON or lacks one of the keys read above.
+        print(f"❌ 连接失败: {str(e)}")
+        print("提示: 请确保服务已启动（python main.py）")
+        return False
+
+
+def test_pdf_extraction(pdf_file: str = None):
+    """Upload a PDF to ``/api/extract/pdf`` and print the extraction result.
+
+    Args:
+        pdf_file: Path of the PDF to upload. When it is None or empty the
+            check is skipped.
+
+    Returns:
+        None when the check is skipped; True when the service answers
+        HTTP 200 and the expected fields can be read from its JSON body;
+        False when the file does not exist, the status code is not 200, or
+        sending the request or reading the response raises.
+    """
+    print("\n" + "="*50)
+    print("测试2: PDF文本提取")
+    print("="*50)
+
+    if not pdf_file:
+        print("跳过: 未提供测试PDF文件")
+        print("使用方法: python test_service.py <pdf文件路径>")
+        return None
+
+    pdf_path = Path(pdf_file)
+
+    if not pdf_path.exists():
+        print(f"❌ 文件不存在: {pdf_file}")
+        return False
+
+    try:
+        print(f"上传文件: {pdf_path.name}")
+        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")
+
+        with open(pdf_path, 'rb') as f:
+            files = {'file': (pdf_path.name, f, 'application/pdf')}
+            # (connect, read) timeout: requests waits forever by default.
+            # The read limit is generous because extraction time is unknown.
+            response = requests.post(
+                f"{BASE_URL}/api/extract/pdf",
+                files=files,
+                timeout=(10, 300)
+            )
+
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+
+            print("\n提取结果:")
+            print(f"方法: {data['method']}")
+            print(f"页数: {data['metadata']['page_count']}")
+            print(f"字符数: {data['metadata']['char_count']}")
+            print(f"文本长度: {len(data['text'])} 字符")
+
+            # Show at most the first 500 characters of the extracted text.
+            print("\n文本预览:")
+            print("-" * 50)
+            print(data['text'][:500])
+            if len(data['text']) > 500:
+                print("...")
+            print("-" * 50)
+
+            print("\n✅ PDF提取成功")
+            return True
+        else:
+            print(f"❌ 提取失败: {response.text}")
+            return False
+
+    except Exception as e:
+        # Covers file, connection and timeout errors as well as a response
+        # body that is not JSON or lacks one of the keys read above.
+        print(f"❌ 请求失败: {str(e)}")
+        return False
+
+
+def test_root():
+    """Call the service root path and print the service name and version.
+
+    Returns:
+        True when the root path answers HTTP 200 and the ``service`` and
+        ``version`` fields can be read from its JSON body; False for any
+        other status code, or when sending the request or reading the
+        response raises.
+    """
+    print("\n" + "="*50)
+    print("测试0: 根路径")
+    print("="*50)
+
+    try:
+        # requests waits forever by default; bound the wait so that a hung
+        # service fails this check instead of blocking the whole script.
+        response = requests.get(f"{BASE_URL}/", timeout=10)
+        print(f"状态码: {response.status_code}")
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"服务: {data['service']}")
+            print(f"版本: {data['version']}")
+            print("✅ 根路径正常")
+            return True
+        else:
+            print("❌ 根路径异常")
+            return False
+    except Exception as e:
+        # Covers connection errors and timeouts as well as a response body
+        # that is not JSON or lacks one of the keys read above.
+        print(f"❌ 连接失败: {str(e)}")
+        return False
+
+
+def main():
+    """Run every check against the service and print a summary.
+
+    The first command-line argument, when given, is the path of a PDF file
+    to upload; without it the PDF extraction check is reported as skipped.
+    Skipped checks are left out of the pass-rate denominator.
+    """
+    divider = "="*50
+
+    print("\n" + divider)
+    print("文档提取微服务 - 测试套件")
+    print(divider)
+
+    # Optional PDF path taken from the command line.
+    pdf_file = None
+    if len(sys.argv) > 1:
+        pdf_file = sys.argv[1]
+
+    # The list literal is evaluated left to right, so the checks run in the
+    # order in which they are listed.
+    results = [
+        ("根路径", test_root()),
+        ("健康检查", test_health()),
+        ("PDF提取", test_pdf_extraction(pdf_file)),
+    ]
+
+    print("\n" + divider)
+    print("测试总结")
+    print(divider)
+
+    passed = 0
+    total = 0
+    for name, outcome in results:
+        if outcome is True:
+            status = "✅ 通过"
+            passed += 1
+        elif outcome is False:
+            status = "❌ 失败"
+        else:
+            status = "⏭️  跳过"
+
+        # Only a None outcome (a skipped check) is excluded from the total.
+        if outcome is not None:
+            total += 1
+
+        print(f"{name}: {status}")
+
+    print(f"\n通过率: {passed}/{total}")
+
+    if passed != total:
+        print("\n⚠️  部分测试失败")
+    else:
+        print("\n🎉 所有测试通过！")
+
+
+# Run the checks only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+
+
+
+
+
+