# AIclinicalresearch/extraction_service/requirements.txt

# FastAPI核心依赖
fastapi==0.104.1
uvicorn[standard]==0.24.0
python-multipart==0.0.6

# PDF处理 - 使用 pymupdf4llm（替代 nougat，更轻量）
pymupdf4llm>=0.0.17         # PDF → Markdown，自动包含 pymupdf
pdfplumber==0.10.3          # 备用 PDF 处理

# Word处理
mammoth==1.6.0              # Docx → Markdown
python-docx==1.1.0          # Docx 读取
pypandoc>=1.13              # Markdown → Docx (需要系统安装 pandoc)

# Excel/CSV处理
pandas>=2.0.0               # 表格处理
openpyxl>=3.1.2             # Excel 读取

# 统计验证 (RVW V2.0 数据侦探)
scipy>=1.11.0               # T检验、卡方检验逆向计算
tabulate>=0.9.0             # DataFrame → Markdown

# PPT处理
python-pptx>=0.6.23         # 仅支持 .pptx 读取（不支持旧版 .ppt）

# 语言检测
langdetect==1.0.9

# 编码检测
chardet==5.2.0

# 工具
python-dotenv==1.0.0
pydantic>=2.10.0

# 日志
loguru==0.7.2

# 测试工具
requests==2.31.0