feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator
Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support parsing of 5+ CI formats (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
187
extraction_service/test_forensics.py
Normal file
187
extraction_service/test_forensics.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""
|
||||
数据侦探模块测试脚本
|
||||
|
||||
测试 forensics 模块的表格提取和验证功能。
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# 添加项目路径
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from forensics.types import ForensicsConfig
|
||||
from forensics.extractor import DocxTableExtractor
|
||||
from forensics.validator import ArithmeticValidator, StatValidator
|
||||
from forensics.config import detect_methods
|
||||
|
||||
# Directory containing the sample .docx documents this script runs against,
# resolved relative to the parent of this file's directory.
TEST_DOCS_DIR = Path(__file__).parent.parent.joinpath(
    "docs", "03-业务模块", "RVW-稿件审查系统", "05-测试文档"
)
|
||||
|
||||
|
||||
def test_single_file(file_path: Path) -> dict:
    """Extract tables from one .docx file, run L1/L2 validation and print a report.

    Args:
        file_path: Path of the Word document to analyse.

    Returns:
        On success, a dict with the keys ``success`` (True), ``file``,
        ``tables`` (table count), ``methods`` (result of ``detect_methods``),
        ``total_issues``, ``error_count`` and ``warning_count``.
        If extraction raises, a dict with ``success`` (False), ``file`` and
        ``error`` (the exception text).
    """
    print(f"\n{'='*60}")
    print(f"📄 测试文件: {file_path.name}")
    print(f" 大小: {file_path.stat().st_size / 1024:.1f} KB")
    print(f"{'='*60}")

    # Build the configuration shared by the extractor and both validators.
    config = ForensicsConfig(
        check_level="L1_L2",
        tolerance_percent=0.1,
        max_table_rows=500
    )

    # Extract the tables and the document's full text.
    extractor = DocxTableExtractor(config)
    try:
        tables, full_text = extractor.extract(str(file_path))
    except Exception as e:
        print(f"❌ 提取失败: {e}")
        # Include the file name so the caller's summary can identify which
        # document failed (the success result carries the same key).
        return {"success": False, "file": file_path.name, "error": str(e)}

    print(f"\n📊 提取结果:")
    print(f" - 表格数量: {len(tables)}")
    print(f" - 全文长度: {len(full_text)} 字符")

    # Detect the statistical methods mentioned in the text.
    methods = detect_methods(full_text)
    print(f" - 检测到的统计方法: {methods if methods else '无'}")

    # Print per-table information.
    for table in tables:
        print(f"\n 📋 表格 {table.id}:")
        print(f" - Caption: {table.caption[:50] if table.caption else '无'}...")
        print(f" - 类型: {table.type}")
        print(f" - 大小: {table.row_count} 行 × {table.col_count} 列")
        print(f" - 跳过: {table.skipped}")

        # Preview the first 3 rows (first 4 cells, 15 characters per cell).
        if table.data and not table.skipped:
            print(f" - 数据预览 (前 3 行):")
            for i, row in enumerate(table.data[:3]):
                row_preview = " | ".join([str(cell)[:15] for cell in row[:4]])
                print(f" Row {i+1}: {row_preview}...")

    # L1 arithmetic validation on every table that was not skipped.
    print(f"\n🔍 L1 算术验证:")
    arithmetic_validator = ArithmeticValidator(config)
    for table in tables:
        if not table.skipped:
            arithmetic_validator.validate(table)

    # L2 statistical validation on every table that was not skipped.
    print(f"🔬 L2 统计验证:")
    stat_validator = StatValidator(config)
    for table in tables:
        if not table.skipped:
            stat_validator.validate(table, full_text)

    # Tally the issues attached to the tables and print each one.
    total_issues = 0
    error_count = 0
    warning_count = 0

    for table in tables:
        for issue in table.issues:
            total_issues += 1
            if issue.severity.value == "ERROR":
                error_count += 1
            elif issue.severity.value == "WARNING":
                warning_count += 1

            # Issue details.
            print(f"\n ⚠️ [{issue.severity.value}] {issue.type.value}")
            print(f" 位置: {issue.location.cell_ref if issue.location else 'N/A'}")
            print(f" 描述: {issue.message}")
            if issue.evidence:
                print(f" 证据: {issue.evidence}")

    print(f"\n📈 统计:")
    print(f" - 总问题数: {total_issues}")
    print(f" - ERROR: {error_count}")
    print(f" - WARNING: {warning_count}")

    # HTML preview of the first table; slicing already caps it at 500 chars.
    if tables and not tables[0].skipped:
        html_preview = tables[0].html[:500]
        print(f"\n📝 HTML 预览 (表格 0):")
        print(html_preview)
        print("...")

    return {
        "success": True,
        "file": file_path.name,
        "tables": len(tables),
        "methods": methods,
        "total_issues": total_issues,
        "error_count": error_count,
        "warning_count": warning_count
    }
|
||||
|
||||
|
||||
def main():
    """Run ``test_single_file`` on every .docx in ``TEST_DOCS_DIR`` and print a summary.

    Returns early (after printing a message) when the directory does not
    exist or contains no .docx documents. An exception raised while testing
    one file is printed with its traceback and recorded as a failed result,
    so the remaining files are still processed.
    """
    import traceback

    print("=" * 70)
    print("🔬 RVW V2.0 数据侦探模块测试")
    print("=" * 70)

    # Make sure the test directory exists.
    if not TEST_DOCS_DIR.exists():
        print(f"❌ 测试目录不存在: {TEST_DOCS_DIR}")
        return

    # Collect the .docx files. glob() order is arbitrary, so sort for a
    # reproducible report, and skip Word's "~$..." owner/lock files, which
    # match the pattern but are not real documents.
    docx_files = sorted(
        path
        for path in TEST_DOCS_DIR.glob("*.docx")
        if not path.name.startswith("~$")
    )

    if not docx_files:
        print(f"❌ 测试目录中没有 .docx 文件")
        return

    print(f"\n📁 测试目录: {TEST_DOCS_DIR}")
    print(f"📄 找到 {len(docx_files)} 个测试文件")

    # Test each file, isolating failures so one bad document cannot abort the run.
    results = []
    for file_path in docx_files:
        try:
            result = test_single_file(file_path)
            results.append(result)
        except Exception as e:
            print(f"\n❌ 测试 {file_path.name} 时出错: {e}")
            traceback.print_exc()
            results.append({
                "success": False,
                "file": file_path.name,
                "error": str(e)
            })

    # Summary.
    print("\n" + "=" * 70)
    print("📊 测试汇总")
    print("=" * 70)

    # Totals are computed over successful results only.
    succeeded = [r for r in results if r.get("success")]
    success_count = len(succeeded)
    total_tables = sum(r.get("tables", 0) for r in succeeded)
    total_issues = sum(r.get("total_issues", 0) for r in succeeded)
    total_errors = sum(r.get("error_count", 0) for r in succeeded)

    print(f"\n✅ 成功: {success_count}/{len(results)}")
    print(f"📋 总表格数: {total_tables}")
    print(f"⚠️ 总问题数: {total_issues} (ERROR: {total_errors})")

    print("\n📝 详细结果:")
    for r in results:
        status = "✅" if r.get("success") else "❌"
        print(f" {status} {r.get('file', 'Unknown')}")
        if r.get("success"):
            print(f" 表格: {r.get('tables', 0)}, 问题: {r.get('total_issues', 0)}, 方法: {r.get('methods', [])}")
        else:
            print(f" 错误: {r.get('error', 'Unknown')}")
|
||||
|
||||
|
||||
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user