"""Manual test script for the data-forensics module.

Exercises the table extraction and validation features of the
``forensics`` package against every .docx file found in TEST_DOCS_DIR
and prints a human-readable report.
"""

import os
import sys
import traceback
from pathlib import Path

# Put this script's directory on sys.path so the ``forensics`` imports
# below resolve when the file is run directly.
sys.path.insert(0, str(Path(__file__).parent))

from forensics.types import ForensicsConfig
from forensics.extractor import DocxTableExtractor
from forensics.validator import ArithmeticValidator, StatValidator
from forensics.config import detect_methods

# Directory that holds the sample documents.
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"


def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def _report_issues(tables) -> tuple:
    """Print every issue attached to *tables* and return the tallies.

    Returns:
        A ``(total_issues, error_count, warning_count)`` tuple.
    """
    total_issues = 0
    error_count = 0
    warning_count = 0
    for table in tables:
        for issue in table.issues:
            total_issues += 1
            severity = issue.severity.value
            if severity == "ERROR":
                error_count += 1
            elif severity == "WARNING":
                warning_count += 1

            # Issue details.
            print(f"\n ⚠️ [{severity}] {issue.type.value}")
            print(f" 位置: {issue.location.cell_ref if issue.location else 'N/A'}")
            print(f" 描述: {issue.message}")
            if issue.evidence:
                print(f" 证据: {issue.evidence}")
    return total_issues, error_count, warning_count


def test_single_file(file_path: Path) -> dict:
    """Extract and validate the tables of one document, printing a report.

    Args:
        file_path: Path of the .docx file to analyse.

    Returns:
        On success, a dict with the keys ``success`` (True), ``file``,
        ``tables``, ``methods``, ``total_issues``, ``error_count`` and
        ``warning_count``. If extraction raises, a dict with ``success``
        (False), ``file`` and ``error``.
    """
    print(f"\n{'='*60}")
    print(f"📄 测试文件: {file_path.name}")
    print(f" 大小: {file_path.stat().st_size / 1024:.1f} KB")
    print(f"{'='*60}")

    # Build the configuration shared by the extractor and both validators.
    config = ForensicsConfig(
        check_level="L1_L2",
        tolerance_percent=0.1,
        max_table_rows=500,
    )

    # Extract the tables and the full document text.
    extractor = DocxTableExtractor(config)
    try:
        tables, full_text = extractor.extract(str(file_path))
    except Exception as e:
        print(f"❌ 提取失败: {e}")
        # Include the file name so the summary in main() can identify
        # which document failed instead of printing "Unknown".
        return {"success": False, "file": file_path.name, "error": str(e)}

    print("\n📊 提取结果:")
    print(f" - 表格数量: {len(tables)}")
    print(f" - 全文长度: {len(full_text)} 字符")

    # Detect the statistical methods mentioned in the text.
    methods = detect_methods(full_text)
    print(f" - 检测到的统计方法: {methods if methods else '无'}")

    # Per-table overview.
    for table in tables:
        print(f"\n 📋 表格 {table.id}:")
        # The ellipsis is appended only when the caption was actually cut.
        print(f" - Caption: {_truncate(table.caption, 50) if table.caption else '无'}")
        print(f" - 类型: {table.type}")
        print(f" - 大小: {table.row_count} 行 × {table.col_count} 列")
        print(f" - 跳过: {table.skipped}")

        # Preview of the first 3 rows (first 4 cells, 15 characters each).
        if table.data and not table.skipped:
            print(" - 数据预览 (前 3 行):")
            for i, row in enumerate(table.data[:3]):
                row_preview = " | ".join([str(cell)[:15] for cell in row[:4]])
                print(f" Row {i+1}: {row_preview}...")

    # L1: arithmetic validation of every non-skipped table.
    print("\n🔍 L1 算术验证:")
    arithmetic_validator = ArithmeticValidator(config)
    for table in tables:
        if not table.skipped:
            arithmetic_validator.validate(table)

    # L2: statistical validation of every non-skipped table.
    print("🔬 L2 统计验证:")
    stat_validator = StatValidator(config)
    for table in tables:
        if not table.skipped:
            stat_validator.validate(table, full_text)

    # Tally and print the issues found on the tables.
    total_issues, error_count, warning_count = _report_issues(tables)

    print("\n📈 统计:")
    print(f" - 总问题数: {total_issues}")
    print(f" - ERROR: {error_count}")
    print(f" - WARNING: {warning_count}")

    # HTML preview of the first table, capped at 500 characters.
    if tables and not tables[0].skipped:
        html = tables[0].html
        print("\n📝 HTML 预览 (表格 0):")
        print(html[:500])
        if len(html) > 500:
            # Only signal truncation when something was really cut off.
            print("...")

    return {
        "success": True,
        "file": file_path.name,
        "tables": len(tables),
        "methods": methods,
        "total_issues": total_issues,
        "error_count": error_count,
        "warning_count": warning_count,
    }


def main():
    """Run test_single_file on every .docx in TEST_DOCS_DIR and print a summary."""
    print("=" * 70)
    print("🔬 RVW V2.0 数据侦探模块测试")
    print("=" * 70)

    # Make sure the test directory exists.
    if not TEST_DOCS_DIR.exists():
        print(f"❌ 测试目录不存在: {TEST_DOCS_DIR}")
        return

    # Collect the .docx files; sorted so the report order is stable
    # between runs (glob yields entries in arbitrary order).
    docx_files = sorted(TEST_DOCS_DIR.glob("*.docx"))
    if not docx_files:
        print("❌ 测试目录中没有 .docx 文件")
        return

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件")

    # Test each file; a failure on one file must not stop the others.
    results = []
    for file_path in docx_files:
        try:
            result = test_single_file(file_path)
            results.append(result)
        except Exception as e:
            print(f"\n❌ 测试 {file_path.name} 时出错: {e}")
            traceback.print_exc()
            results.append({
                "success": False,
                "file": file_path.name,
                "error": str(e),
            })

    # Summary across all files.
    print("\n" + "=" * 70)
    print("📊 测试汇总")
    print("=" * 70)

    succeeded = [r for r in results if r.get("success")]
    success_count = len(succeeded)
    total_tables = sum(r.get("tables", 0) for r in succeeded)
    total_issues = sum(r.get("total_issues", 0) for r in succeeded)
    total_errors = sum(r.get("error_count", 0) for r in succeeded)

    print(f"\n✅ 成功: {success_count}/{len(results)}")
    print(f"📋 总表格数: {total_tables}")
    print(f"⚠️ 总问题数: {total_issues} (ERROR: {total_errors})")

    print("\n📝 详细结果:")
    for r in results:
        status = "✅" if r.get("success") else "❌"
        print(f" {status} {r.get('file', 'Unknown')}")
        if r.get("success"):
            print(f" 表格: {r.get('tables', 0)}, 问题: {r.get('total_issues', 0)}, 方法: {r.get('methods', [])}")
        else:
            print(f" 错误: {r.get('error', 'Unknown')}")


if __name__ == "__main__":
    main()