feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv

This commit is contained in:
AI Clinical Dev Team
2025-11-16 15:32:44 +08:00
parent 2a4f59b08b
commit 39eb62ee79
18 changed files with 2706 additions and 0 deletions

View File

@@ -0,0 +1,171 @@
"""
服务测试脚本
测试文档提取微服务的各项功能
"""
import requests
import sys
from pathlib import Path
# Base address of the service under test; the request URLs in this script are
# built by appending an endpoint path to it.
BASE_URL = "http://localhost:8000"
def test_health(timeout: float = 10.0) -> bool:
    """Probe the health endpoint and print what the service reports.

    Sends ``GET {BASE_URL}/api/health``. On HTTP 200 the JSON body is read and
    the service status, PyMuPDF availability/version and temp-directory path
    are printed.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a hung server.

    Returns:
        True if the endpoint answered 200 with the expected fields, False for
        any other status code, a malformed body, or a failed request.
    """
    print("\n" + "=" * 50)
    print("测试1: 健康检查")
    print("=" * 50)
    try:
        response = requests.get(f"{BASE_URL}/api/health", timeout=timeout)
        print(f"状态码: {response.status_code}")
        if response.status_code != 200:
            print("❌ 健康检查失败")
            return False
        data = response.json()
        print(f"服务状态: {data['status']}")
        print(f"PyMuPDF: {data['checks']['pymupdf']['available']} (v{data['checks']['pymupdf']['version']})")
        print(f"临时目录: {data['checks']['temp_dir']['path']}")
        print("✅ 健康检查通过")
        return True
    except (KeyError, TypeError) as e:
        # The server answered, but the body lacks a field read above; this is
        # not a connectivity problem, so do not print the "start the service" hint.
        print(f"❌ 响应格式异常: {e!r}")
        return False
    except Exception as e:
        # Best-effort boundary: any other failure (refused connection,
        # timeout, undecodable body) is reported and counted as a failed test.
        print(f"❌ 连接失败: {str(e)}")
        print("提示: 请确保服务已启动python main.py")
        return False
def test_pdf_extraction(pdf_file: str = None, timeout: float = 60.0):
    """Upload a PDF to the extraction endpoint and print a summary of the result.

    Posts the file as multipart form data to ``{BASE_URL}/api/extract/pdf``.
    On HTTP 200 the extraction method, page count, character count and the
    first 500 characters of the extracted text are printed.

    Args:
        pdf_file: Path to the PDF to upload. When empty or None the test is
            skipped.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a hung server.

    Returns:
        None if the test was skipped because no file was given; True if the
        extraction succeeded; False if the file does not exist, the server
        returned a non-200 status, the body was malformed, or the request
        failed.
    """
    print("\n" + "=" * 50)
    print("测试2: PDF文本提取")
    print("=" * 50)

    if not pdf_file:
        print("跳过: 未提供测试PDF文件")
        print("使用方法: python test_service.py <pdf文件路径>")
        return None

    pdf_path = Path(pdf_file)
    if not pdf_path.exists():
        print(f"❌ 文件不存在: {pdf_file}")
        return False

    try:
        print(f"上传文件: {pdf_path.name}")
        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")
        with open(pdf_path, 'rb') as f:
            files = {'file': (pdf_path.name, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/api/extract/pdf",
                files=files,
                timeout=timeout,
            )
        print(f"状态码: {response.status_code}")
        if response.status_code != 200:
            print(f"❌ 提取失败: {response.text}")
            return False

        data = response.json()
        print("\n提取结果:")
        print(f"方法: {data['method']}")
        print(f"页数: {data['metadata']['page_count']}")
        print(f"字符数: {data['metadata']['char_count']}")
        print(f"文本长度: {len(data['text'])} 字符")
        # Show only the first 500 characters as a preview.
        print("\n文本预览:")
        print("-" * 50)
        print(data['text'][:500])
        if len(data['text']) > 500:
            print("...")
        print("-" * 50)
        print("\n✅ PDF提取成功")
        return True
    except (KeyError, TypeError) as e:
        # The server answered 200 but the body lacks a field read above.
        print(f"❌ 响应格式异常: {e!r}")
        return False
    except Exception as e:
        # Best-effort boundary: any other failure (I/O error, refused
        # connection, timeout, undecodable body) counts as a failed test.
        print(f"❌ 请求失败: {str(e)}")
        return False
def test_root(timeout: float = 10.0) -> bool:
    """Request the service root path and print its name and version.

    Sends ``GET {BASE_URL}/`` and, on HTTP 200, prints the ``service`` and
    ``version`` fields of the JSON body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a hung server.

    Returns:
        True if the root path answered 200 with the expected fields, False for
        any other status code, a malformed body, or a failed request.
    """
    print("\n" + "=" * 50)
    print("测试0: 根路径")
    print("=" * 50)
    try:
        response = requests.get(f"{BASE_URL}/", timeout=timeout)
        print(f"状态码: {response.status_code}")
        if response.status_code != 200:
            print("❌ 根路径异常")
            return False
        data = response.json()
        print(f"服务: {data['service']}")
        print(f"版本: {data['version']}")
        print("✅ 根路径正常")
        return True
    except (KeyError, TypeError) as e:
        # The server answered, but the body lacks a field read above; report
        # that separately from a connectivity failure.
        print(f"❌ 响应格式异常: {e!r}")
        return False
    except Exception as e:
        # Best-effort boundary: any other failure counts as a failed test.
        print(f"❌ 连接失败: {str(e)}")
        return False
def main() -> int:
    """Run every check in order, print a summary, and report overall success.

    The optional first command-line argument is the path of a PDF to upload;
    without it the PDF extraction test is skipped. Skipped tests (result
    ``None``) are excluded from the pass-rate denominator.

    Returns:
        0 if every non-skipped test passed, 1 otherwise, suitable as a process
        exit status.
    """
    print("\n" + "=" * 50)
    print("文档提取微服务 - 测试套件")
    print("=" * 50)

    # PDF path is optional; take it from the command line when provided.
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None

    # The list literal evaluates its elements in order, so the tests run as
    # root -> health -> PDF extraction.
    results = [
        ("根路径", test_root()),
        ("健康检查", test_health()),
        ("PDF提取", test_pdf_extraction(pdf_file)),
    ]

    print("\n" + "=" * 50)
    print("测试总结")
    print("=" * 50)
    for name, result in results:
        # Compare by identity: None means "skipped", not "failed".
        if result is True:
            status = "✅ 通过"
        elif result is False:
            status = "❌ 失败"
        else:
            status = "⏭️ 跳过"
        print(f"{name}: {status}")

    passed = sum(1 for _, r in results if r is True)
    total = sum(1 for _, r in results if r is not None)
    print(f"\n通过率: {passed}/{total}")

    all_passed = passed == total
    if all_passed:
        print("\n🎉 所有测试通过!")
    else:
        print("\n⚠️ 部分测试失败")
    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the outcome as the exit status so shells and CI can detect
    # a failing run.
    sys.exit(main())