Files
AIclinicalresearch/extraction_service/test_forensics.py
HaHafeng e785969e54 feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-17 22:15:27 +08:00

188 lines
6.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据侦探模块测试脚本
测试 forensics 模块的表格提取和验证功能。
"""
import os
import sys
from pathlib import Path
# 添加项目路径
sys.path.insert(0, str(Path(__file__).parent))
from forensics.types import ForensicsConfig
from forensics.extractor import DocxTableExtractor
from forensics.validator import ArithmeticValidator, StatValidator
from forensics.config import detect_methods
# Directory holding the test documents; main() scans it for *.docx files.
# Resolved relative to the parent of the directory that contains this script.
TEST_DOCS_DIR = Path(__file__).parent.parent / "docs" / "03-业务模块" / "RVW-稿件审查系统" / "05-测试文档"
def test_single_file(file_path: Path) -> dict:
    """Extract, validate and report on the tables of a single document.

    Prints a human-readable report to stdout: file header, extraction
    summary, a per-table overview, every issue found on the tables, and an
    HTML preview of the first table.

    Args:
        file_path: Path of the document handed to ``DocxTableExtractor``.

    Returns:
        On success, a dict with ``success=True`` plus ``file``, ``tables``,
        ``methods``, ``total_issues``, ``error_count`` and ``warning_count``.
        If extraction raises, a dict with ``success=False``, ``file`` and
        ``error`` (the exception text).
    """
    # NOTE(review): the ``test_`` prefix combined with a required parameter
    # means a pytest run over this file would likely try to collect this
    # function and fail on a missing ``file_path`` fixture -- confirm this
    # module is only ever executed as a script.
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"📄 测试文件: {file_path.name}")
    print(f" 大小: {file_path.stat().st_size / 1024:.1f} KB")
    print(rule)

    # Build the validator/extractor configuration.
    config = ForensicsConfig(
        check_level="L1_L2",
        tolerance_percent=0.1,
        max_table_rows=500,
    )

    # Extract tables and the full document text.
    extractor = DocxTableExtractor(config)
    try:
        tables, full_text = extractor.extract(str(file_path))
    except Exception as e:
        # Best-effort script: report the failure and let the caller continue
        # with the next file. The file name is included so the summary in
        # main() can identify which document failed.
        print(f"❌ 提取失败: {e}")
        return {"success": False, "file": file_path.name, "error": str(e)}

    print("\n📊 提取结果:")
    print(f" - 表格数量: {len(tables)}")
    print(f" - 全文长度: {len(full_text)} 字符")

    # Detect which statistical methods the text mentions.
    methods = detect_methods(full_text)
    print(f" - 检测到的统计方法: {methods if methods else ''}")

    # Per-table overview.
    for table in tables:
        print(f"\n 📋 表格 {table.id}:")
        print(f" - Caption: {table.caption[:50] if table.caption else ''}...")
        print(f" - 类型: {table.type}")
        print(f" - 大小: {table.row_count}× {table.col_count}")
        print(f" - 跳过: {table.skipped}")
        # Preview at most the first 3 rows, 4 cells per row, 15 chars per cell.
        if table.data and not table.skipped:
            print(" - 数据预览 (前 3 行):")
            for row_no, row in enumerate(table.data[:3], start=1):
                row_preview = " | ".join(str(cell)[:15] for cell in row[:4])
                print(f" Row {row_no}: {row_preview}...")

    # L1 arithmetic validation (skipped tables are not validated).
    print("\n🔍 L1 算术验证:")
    arithmetic_validator = ArithmeticValidator(config)
    for table in tables:
        if not table.skipped:
            arithmetic_validator.validate(table)

    # L2 statistical validation; also receives the full document text.
    print("🔬 L2 统计验证:")
    stat_validator = StatValidator(config)
    for table in tables:
        if not table.skipped:
            stat_validator.validate(table, full_text)

    # Tally and print the issues found on each table.
    # (presumably the validators above populate ``table.issues`` -- verify)
    total_issues = 0
    error_count = 0
    warning_count = 0
    for table in tables:
        for issue in table.issues:
            total_issues += 1
            severity = issue.severity.value
            if severity == "ERROR":
                error_count += 1
            elif severity == "WARNING":
                warning_count += 1
            # Issue details.
            print(f"\n ⚠️ [{severity}] {issue.type.value}")
            print(f" 位置: {issue.location.cell_ref if issue.location else 'N/A'}")
            print(f" 描述: {issue.message}")
            if issue.evidence:
                print(f" 证据: {issue.evidence}")

    print("\n📈 统计:")
    print(f" - 总问题数: {total_issues}")
    print(f" - ERROR: {error_count}")
    print(f" - WARNING: {warning_count}")

    # HTML preview of the first table, capped at 500 characters.
    if tables and not tables[0].skipped:
        # Slicing already clamps to the string length, so no length check
        # is needed.
        html_preview = tables[0].html[:500]
        print("\n📝 HTML 预览 (表格 0):")
        print(html_preview)
        print("...")

    return {
        "success": True,
        "file": file_path.name,
        "tables": len(tables),
        "methods": methods,
        "total_issues": total_issues,
        "error_count": error_count,
        "warning_count": warning_count,
    }
def main():
    """Run ``test_single_file`` over every .docx in ``TEST_DOCS_DIR``.

    Prints a banner, processes each document (recording a failure result
    instead of aborting when one raises), then prints aggregate totals and a
    per-file result line. Returns ``None``; returns early with a message if
    the directory is missing or contains no .docx files.
    """
    import traceback  # only needed for failure reporting below

    banner = "=" * 70
    print(banner)
    print("🔬 RVW V2.0 数据侦探模块测试")
    print(banner)

    # Make sure the test directory exists.
    if not TEST_DOCS_DIR.exists():
        print(f"❌ 测试目录不存在: {TEST_DOCS_DIR}")
        return

    # Collect all .docx files. glob() order is not guaranteed, so sort to
    # keep the report order stable between runs.
    docx_files = sorted(TEST_DOCS_DIR.glob("*.docx"))
    if not docx_files:
        print("❌ 测试目录中没有 .docx 文件")
        return

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件")

    # Process each file; one failing document must not stop the sweep.
    results = []
    for file_path in docx_files:
        try:
            results.append(test_single_file(file_path))
        except Exception as e:
            print(f"\n❌ 测试 {file_path.name} 时出错: {e}")
            traceback.print_exc()
            results.append({
                "success": False,
                "file": file_path.name,
                "error": str(e),
            })

    # Aggregate totals over the successful results only.
    print("\n" + banner)
    print("📊 测试汇总")
    print(banner)

    succeeded = [r for r in results if r.get("success")]
    success_count = len(succeeded)
    total_tables = sum(r.get("tables", 0) for r in succeeded)
    total_issues = sum(r.get("total_issues", 0) for r in succeeded)
    total_errors = sum(r.get("error_count", 0) for r in succeeded)

    print(f"\n✅ 成功: {success_count}/{len(results)}")
    print(f"📋 总表格数: {total_tables}")
    print(f"⚠️ 总问题数: {total_issues} (ERROR: {total_errors})")

    print("\n📝 详细结果:")
    for r in results:
        # Distinct markers so passed and failed files can be told apart.
        status = "✅" if r.get("success") else "❌"
        print(f" {status} {r.get('file', 'Unknown')}")
        if r.get("success"):
            print(f" 表格: {r.get('tables', 0)}, 问题: {r.get('total_issues', 0)}, 方法: {r.get('methods', [])}")
        else:
            print(f" 错误: {r.get('error', 'Unknown')}")
# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()