feat: add extraction_service (PDF/Docx/Txt) and update .gitignore to exclude venv
This commit is contained in:
171
extraction_service/test_service.py
Normal file
171
extraction_service/test_service.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
Service test script

Tests the various features of the document extraction microservice.
"""
|
||||
|
||||
import requests
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Base URL that every request made by this script is sent to.
BASE_URL = "http://localhost:8000"
|
||||
|
||||
|
||||
def test_health(*, timeout: float = 10.0) -> bool:
    """Call the health-check endpoint and print the reported status.

    Args:
        timeout: Value handed to ``requests`` as the request timeout, in
            seconds. Without it an unresponsive service would block this
            script indefinitely.

    Returns:
        True when the endpoint answers with HTTP 200 and the expected
        fields can be read from the JSON body; False on any other status
        code or on any exception (connection error, timeout, bad payload).
    """
    print("\n" + "="*50)
    print("测试1: 健康检查")
    print("="*50)

    try:
        response = requests.get(f"{BASE_URL}/api/health", timeout=timeout)
        print(f"状态码: {response.status_code}")

        if response.status_code != 200:
            print("❌ 健康检查失败")
            return False

        data = response.json()
        print(f"服务状态: {data['status']}")
        pymupdf = data['checks']['pymupdf']
        print(f"PyMuPDF: {pymupdf['available']} (v{pymupdf['version']})")
        print(f"临时目录: {data['checks']['temp_dir']['path']}")
        print("✅ 健康检查通过")
        return True
    except Exception as exc:
        # Deliberately broad: any failure is reported as a failed test so
        # the remaining tests in the suite still get to run.
        print(f"❌ 连接失败: {exc}")
        print("提示: 请确保服务已启动(python main.py)")
        return False
|
||||
|
||||
|
||||
def test_pdf_extraction(pdf_file: str = None, *, timeout: float = 60.0):
    """Upload a PDF to the extraction endpoint and print the result.

    Args:
        pdf_file: Path of the PDF file to upload. When it is omitted (or
            otherwise falsy) the test is skipped.
        timeout: Value handed to ``requests`` as the request timeout, in
            seconds. Without it an unresponsive service would block this
            script indefinitely.

    Returns:
        None when the test is skipped; True when the endpoint answers with
        HTTP 200 and the expected fields can be read from the JSON body;
        False when the file does not exist, the status code is not 200, or
        any exception is raised while uploading or reading the response.
    """
    print("\n" + "="*50)
    print("测试2: PDF文本提取")
    print("="*50)

    if not pdf_file:
        print("跳过: 未提供测试PDF文件")
        print("使用方法: python test_service.py <pdf文件路径>")
        return None

    pdf_path = Path(pdf_file)

    if not pdf_path.exists():
        print(f"❌ 文件不存在: {pdf_file}")
        return False

    try:
        print(f"上传文件: {pdf_path.name}")
        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")

        # Keep the file open only for the duration of the upload.
        with open(pdf_path, 'rb') as f:
            files = {'file': (pdf_path.name, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/api/extract/pdf",
                files=files,
                timeout=timeout,
            )

        print(f"状态码: {response.status_code}")

        if response.status_code != 200:
            print(f"❌ 提取失败: {response.text}")
            return False

        data = response.json()
        text = data['text']

        print("\n提取结果:")
        print(f"方法: {data['method']}")
        print(f"页数: {data['metadata']['page_count']}")
        print(f"字符数: {data['metadata']['char_count']}")
        print(f"文本长度: {len(text)} 字符")

        # Preview only the first 500 characters; mark truncation with "...".
        print("\n文本预览:")
        print("-" * 50)
        print(text[:500])
        if len(text) > 500:
            print("...")
        print("-" * 50)

        print("\n✅ PDF提取成功")
        return True

    except Exception as exc:
        # Deliberately broad: any failure is reported as a failed test so
        # the caller can still print its summary.
        print(f"❌ 请求失败: {exc}")
        return False
|
||||
|
||||
|
||||
def test_root(*, timeout: float = 10.0) -> bool:
    """Call the service root path and print its name and version.

    Args:
        timeout: Value handed to ``requests`` as the request timeout, in
            seconds. Without it an unresponsive service would block this
            script indefinitely.

    Returns:
        True when the root path answers with HTTP 200 and the ``service``
        and ``version`` fields can be read from the JSON body; False on any
        other status code or on any exception.
    """
    print("\n" + "="*50)
    print("测试0: 根路径")
    print("="*50)

    try:
        response = requests.get(f"{BASE_URL}/", timeout=timeout)
        print(f"状态码: {response.status_code}")

        if response.status_code != 200:
            print("❌ 根路径异常")
            return False

        data = response.json()
        print(f"服务: {data['service']}")
        print(f"版本: {data['version']}")
        print("✅ 根路径正常")
        return True
    except Exception as exc:
        # Deliberately broad: any failure is reported as a failed test so
        # the remaining tests in the suite still get to run.
        print(f"❌ 连接失败: {exc}")
        return False
|
||||
|
||||
|
||||
def main() -> int:
    """Run every service test, print a summary, and report the outcome.

    The first command-line argument, when present, is used as the path of
    the PDF file for the extraction test; without it that test is skipped.

    Returns:
        0 when every test that was not skipped passed, 1 otherwise. The
        value is suitable for use as a process exit status.
    """
    print("\n" + "="*50)
    print("文档提取微服务 - 测试套件")
    print("="*50)

    # Optional PDF path supplied on the command line.
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None

    # Each entry is (display name, outcome); outcome is True (passed),
    # False (failed) or None (skipped). Tests run in the order listed.
    results = [
        ("根路径", test_root()),
        ("健康检查", test_health()),
        ("PDF提取", test_pdf_extraction(pdf_file)),
    ]

    print("\n" + "="*50)
    print("测试总结")
    print("="*50)

    for name, result in results:
        if result is True:
            status = "✅ 通过"
        elif result is False:
            status = "❌ 失败"
        else:
            status = "⏭️ 跳过"
        print(f"{name}: {status}")

    # Skipped tests (None) count neither as passed nor towards the total.
    passed = sum(1 for _, r in results if r is True)
    total = sum(1 for _, r in results if r is not None)

    print(f"\n通过率: {passed}/{total}")

    if passed == total:
        print("\n🎉 所有测试通过!")
        return 0

    print("\n⚠️ 部分测试失败")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status so that callers
    # (shell scripts, CI) can detect failures; a None return exits with 0.
    sys.exit(main())
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user