# AIclinicalresearch/extraction_service/test_forensics.py

"""
数据侦探模块测试脚本

测试 forensics 模块的表格提取和验证功能。
"""

import os
import sys
from pathlib import Path

# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))

from forensics.types import ForensicsConfig
from forensics.extractor import DocxTableExtractor
from forensics.validator import ArithmeticValidator, StatValidator
from forensics.config import detect_methods

# Directory containing the test documents (main() looks for *.docx files here).
# The path is resolved to an absolute one first: when __file__ is a bare
# relative name (as it is for a script run directly on Python < 3.9),
# ".parent.parent" would otherwise collapse to "." and the docs directory
# would be looked up under the wrong parent.
TEST_DOCS_DIR = (
    Path(__file__).resolve().parent.parent
    / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
)


def test_single_file(file_path: Path) -> dict:
    """Extract the tables of one document, run the L1/L2 validators and print a report.

    Args:
        file_path: Path of the document to analyse.

    Returns:
        A result dict. On success it holds ``success`` (True), ``file``,
        ``tables`` (table count), ``methods``, ``total_issues``,
        ``error_count`` and ``warning_count``. When extraction raises it holds
        ``success`` (False), ``file`` and ``error`` (the exception text).
    """
    print(f"\n{'='*60}")
    print(f"📄 测试文件: {file_path.name}")
    print(f"   大小: {file_path.stat().st_size / 1024:.1f} KB")
    print(f"{'='*60}")

    # Build the configuration shared by the extractor and both validators.
    config = ForensicsConfig(
        check_level="L1_L2",
        tolerance_percent=0.1,
        max_table_rows=500,
    )

    # Extract the tables and the full document text.
    extractor = DocxTableExtractor(config)
    try:
        tables, full_text = extractor.extract(str(file_path))
    except Exception as e:
        print(f"❌ 提取失败: {e}")
        # Carry the file name so the caller's summary can identify which
        # document failed instead of falling back to "Unknown".
        return {"success": False, "file": file_path.name, "error": str(e)}

    print(f"\n📊 提取结果:")
    print(f"   - 表格数量: {len(tables)}")
    print(f"   - 全文长度: {len(full_text)} 字符")

    # Detect the statistical methods mentioned in the text.
    methods = detect_methods(full_text)
    print(f"   - 检测到的统计方法: {methods if methods else '无'}")

    # Per-table overview.
    for table in tables:
        print(f"\n   📋 表格 {table.id}:")
        print(f"      - Caption: {table.caption[:50] if table.caption else '无'}...")
        print(f"      - 类型: {table.type}")
        print(f"      - 大小: {table.row_count} 行 × {table.col_count} 列")
        print(f"      - 跳过: {table.skipped}")

        # Preview the first 3 rows: at most 4 cells each, 15 characters per cell.
        if table.data and not table.skipped:
            print(f"      - 数据预览 (前 3 行):")
            for row_no, row in enumerate(table.data[:3], start=1):
                row_preview = " | ".join(str(cell)[:15] for cell in row[:4])
                print(f"         Row {row_no}: {row_preview}...")

    # L1 arithmetic validation. The validators' return values are not used;
    # findings are read back from ``table.issues`` below (presumably the
    # validators append to it -- confirm against the forensics package).
    print(f"\n🔍 L1 算术验证:")
    arithmetic_validator = ArithmeticValidator(config)
    for table in tables:
        if not table.skipped:
            arithmetic_validator.validate(table)

    # L2 statistical validation; also receives the full document text.
    print(f"🔬 L2 统计验证:")
    stat_validator = StatValidator(config)
    for table in tables:
        if not table.skipped:
            stat_validator.validate(table, full_text)

    # Tally the issues. Severities other than ERROR/WARNING count only
    # towards the total.
    total_issues = 0
    error_count = 0
    warning_count = 0

    for table in tables:
        for issue in table.issues:
            total_issues += 1
            severity = issue.severity.value
            if severity == "ERROR":
                error_count += 1
            elif severity == "WARNING":
                warning_count += 1

            # Print the details of each issue.
            print(f"\n   ⚠️ [{severity}] {issue.type.value}")
            print(f"      位置: {issue.location.cell_ref if issue.location else 'N/A'}")
            print(f"      描述: {issue.message}")
            if issue.evidence:
                print(f"      证据: {issue.evidence}")

    print(f"\n📈 统计:")
    print(f"   - 总问题数: {total_issues}")
    print(f"   - ERROR: {error_count}")
    print(f"   - WARNING: {warning_count}")

    # HTML preview of the first table, capped at 500 characters. Slicing
    # already returns the whole string when it is shorter than the cap.
    if tables and not tables[0].skipped:
        html_preview = tables[0].html[:500]
        print(f"\n📝 HTML 预览 (表格 0):")
        print(html_preview)
        print("...")

    return {
        "success": True,
        "file": file_path.name,
        "tables": len(tables),
        "methods": methods,
        "total_issues": total_issues,
        "error_count": error_count,
        "warning_count": warning_count,
    }


def main() -> int:
    """Run test_single_file on every .docx file in TEST_DOCS_DIR and print a summary.

    Returns:
        0 when at least one file was tested and every file succeeded;
        1 when the test directory is missing, contains no usable .docx
        files, or any file failed.
    """
    import traceback

    print("=" * 70)
    print("🔬 RVW V2.0 数据侦探模块测试")
    print("=" * 70)

    # Make sure the test directory exists.
    if not TEST_DOCS_DIR.exists():
        print(f"❌ 测试目录不存在: {TEST_DOCS_DIR}")
        return 1

    # Collect the .docx files. glob() yields entries in arbitrary order, so
    # sort for a reproducible report, and skip the "~$name.docx" lock files
    # Word leaves next to open documents (they are not real documents).
    docx_files = sorted(
        path
        for path in TEST_DOCS_DIR.glob("*.docx")
        if not path.name.startswith("~$")
    )

    if not docx_files:
        print(f"❌ 测试目录中没有 .docx 文件")
        return 1

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件")

    # Test each file; an unexpected exception is recorded as a failed result
    # so that the remaining files are still processed.
    results = []
    for file_path in docx_files:
        try:
            result = test_single_file(file_path)
            results.append(result)
        except Exception as e:
            print(f"\n❌ 测试 {file_path.name} 时出错: {e}")
            traceback.print_exc()
            results.append({
                "success": False,
                "file": file_path.name,
                "error": str(e),
            })

    # Summary: totals are computed over the successful results only.
    print("\n" + "=" * 70)
    print("📊 测试汇总")
    print("=" * 70)

    succeeded = [r for r in results if r.get("success")]
    success_count = len(succeeded)
    total_tables = sum(r.get("tables", 0) for r in succeeded)
    total_issues = sum(r.get("total_issues", 0) for r in succeeded)
    total_errors = sum(r.get("error_count", 0) for r in succeeded)

    print(f"\n✅ 成功: {success_count}/{len(results)}")
    print(f"📋 总表格数: {total_tables}")
    print(f"⚠️ 总问题数: {total_issues} (ERROR: {total_errors})")

    print("\n📝 详细结果:")
    for r in results:
        status = "✅" if r.get("success") else "❌"
        print(f"   {status} {r.get('file', 'Unknown')}")
        if r.get("success"):
            print(f"      表格: {r.get('tables', 0)}, 问题: {r.get('total_issues', 0)}, 方法: {r.get('methods', [])}")
        else:
            print(f"      错误: {r.get('error', 'Unknown')}")

    # Report overall success/failure to the caller.
    return 0 if success_count == len(results) else 1

if __name__ == "__main__":
    # Run only when executed as a script, and hand main()'s return value to
    # sys.exit so the shell sees it as the process exit status (a None return
    # still exits with status 0).
    sys.exit(main())