# AIclinicalresearch/extraction_service/test_service.py

"""
服务测试脚本

测试文档提取微服务的各项功能
"""

import requests
import sys
from pathlib import Path


# Base URL of the extraction service under test; every request in this
# script is built by appending an endpoint path to it.
BASE_URL = "http://localhost:8000"


def test_health(timeout: float = 10.0):
    """Check the service health endpoint.

    Sends ``GET {BASE_URL}/api/health`` and prints the overall status, the
    PyMuPDF availability/version and the temp-directory path reported in the
    JSON body.

    Args:
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one an unresponsive host would hang the
            script forever.

    Returns:
        True when the endpoint answers 200 with the expected fields; False on
        a connection error, a non-200 status, or a malformed response body.
    """
    print("\n" + "="*50)
    print("测试1: 健康检查")
    print("="*50)

    # Only the network call lives in this try block, so that a parsing error
    # further down is never misreported as a connection failure.
    try:
        response = requests.get(f"{BASE_URL}/api/health", timeout=timeout)
    except requests.RequestException as e:
        print(f"❌ 连接失败: {str(e)}")
        print("提示: 请确保服务已启动（python main.py）")
        return False

    print(f"状态码: {response.status_code}")

    if response.status_code != 200:
        print("❌ 健康检查失败")
        return False

    # Pull out every field before printing so a missing key does not leave a
    # half-printed report behind.
    try:
        data = response.json()
        status = data['status']
        pymupdf = data['checks']['pymupdf']
        pymupdf_available = pymupdf['available']
        pymupdf_version = pymupdf['version']
        temp_dir_path = data['checks']['temp_dir']['path']
    except (KeyError, TypeError, ValueError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print(f"❌ 响应格式异常: {e!r}")
        return False

    print(f"服务状态: {status}")
    print(f"PyMuPDF: {pymupdf_available} (v{pymupdf_version})")
    print(f"临时目录: {temp_dir_path}")
    print("✅ 健康检查通过")
    return True


def test_pdf_extraction(pdf_file: str = None, timeout: float = 120.0):
    """Upload a PDF to the extraction endpoint and print the result.

    Posts the file as multipart form data to
    ``{BASE_URL}/api/extract/pdf`` and prints the extraction method, page
    count, character count and a preview of the first 500 characters.

    Args:
        pdf_file: Path of the PDF to upload. When falsy (``None`` or empty)
            the test is skipped.
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled extraction would hang the
            script forever.

    Returns:
        None when the test is skipped; True when the service answers 200 with
        the expected fields; False when the path is missing or not a regular
        file, the request fails, the status is not 200, or the response body
        is malformed.
    """
    print("\n" + "="*50)
    print("测试2: PDF文本提取")
    print("="*50)

    if not pdf_file:
        print("跳过: 未提供测试PDF文件")
        print("使用方法: python test_service.py <pdf文件路径>")
        return None

    pdf_path = Path(pdf_file)

    if not pdf_path.exists():
        print(f"❌ 文件不存在: {pdf_file}")
        return False

    # A directory passes exists() but cannot be opened for upload; reject it
    # here instead of letting open() fail and be reported as a request error.
    if not pdf_path.is_file():
        print(f"❌ 路径不是文件: {pdf_file}")
        return False

    try:
        print(f"上传文件: {pdf_path.name}")
        print(f"文件大小: {pdf_path.stat().st_size / 1024:.2f} KB")

        with open(pdf_path, 'rb') as f:
            files = {'file': (pdf_path.name, f, 'application/pdf')}
            response = requests.post(
                f"{BASE_URL}/api/extract/pdf",
                files=files,
                timeout=timeout,
            )
    except (OSError, requests.RequestException) as e:
        # OSError covers local file problems (permissions, vanished file);
        # RequestException covers connection errors and timeouts.
        print(f"❌ 请求失败: {str(e)}")
        return False

    print(f"状态码: {response.status_code}")

    if response.status_code != 200:
        print(f"❌ 提取失败: {response.text}")
        return False

    # Pull out every field before printing so a missing key does not leave a
    # half-printed report behind.
    try:
        data = response.json()
        method = data['method']
        page_count = data['metadata']['page_count']
        char_count = data['metadata']['char_count']
        text = data['text']
        text_length = len(text)
        preview = text[:500]
    except (KeyError, TypeError, ValueError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print(f"❌ 响应格式异常: {e!r}")
        return False

    print("\n提取结果:")
    print(f"方法: {method}")
    print(f"页数: {page_count}")
    print(f"字符数: {char_count}")
    print(f"文本长度: {text_length} 字符")

    # Show only the first 500 characters, with an ellipsis when truncated.
    print("\n文本预览:")
    print("-" * 50)
    print(preview)
    if text_length > 500:
        print("...")
    print("-" * 50)

    print("\n✅ PDF提取成功")
    return True

def test_root(timeout: float = 10.0):
    """Check that the service root endpoint responds with its identity.

    Sends ``GET {BASE_URL}/`` and prints the ``service`` and ``version``
    fields of the JSON body.

    Args:
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one an unresponsive host would hang the
            script forever.

    Returns:
        True when the endpoint answers 200 with the expected fields; False on
        a connection error, a non-200 status, or a malformed response body.
    """
    print("\n" + "="*50)
    print("测试0: 根路径")
    print("="*50)

    # Only the network call lives in this try block, so that a parsing error
    # further down is never misreported as a connection failure.
    try:
        response = requests.get(f"{BASE_URL}/", timeout=timeout)
    except requests.RequestException as e:
        print(f"❌ 连接失败: {str(e)}")
        return False

    print(f"状态码: {response.status_code}")

    if response.status_code != 200:
        print("❌ 根路径异常")
        return False

    try:
        data = response.json()
        service = data['service']
        version = data['version']
    except (KeyError, TypeError, ValueError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected shape.
        print(f"❌ 响应格式异常: {e!r}")
        return False

    print(f"服务: {service}")
    print(f"版本: {version}")
    print("✅ 根路径正常")
    return True


def main():
    """Run every service test in order and print a summary.

    The optional first command-line argument is the path of a PDF to upload
    in the extraction test; without it that test is skipped.

    Returns:
        0 when every test that actually ran passed, 1 otherwise, so the value
        can be used directly as the process exit status.
    """
    print("\n" + "="*50)
    print("文档提取微服务 - 测试套件")
    print("="*50)

    # Optional PDF path supplied on the command line.
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else None

    # Each entry is (label, outcome); outcome is True (passed), False
    # (failed) or None (skipped). The tests run in the order listed.
    results = [
        ("根路径", test_root()),
        ("健康检查", test_health()),
        ("PDF提取", test_pdf_extraction(pdf_file)),
    ]

    print("\n" + "="*50)
    print("测试总结")
    print("="*50)

    for name, result in results:
        if result is True:
            status = "✅ 通过"
        elif result is False:
            status = "❌ 失败"
        else:
            status = "⏭️  跳过"
        print(f"{name}: {status}")

    # Skipped tests (None) count neither as passed nor towards the total.
    passed = sum(1 for _, r in results if r is True)
    total = sum(1 for _, r in results if r is not None)

    print(f"\n通过率: {passed}/{total}")

    all_passed = passed == total
    if all_passed:
        print("\n🎉 所有测试通过！")
    else:
        print("\n⚠️  部分测试失败")

    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the outcome as the exit status so shells and CI can detect
    # a failing run.
    sys.exit(main())