Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)

Co-authored-by: Cursor <cursoragent@cursor.com>
246 lines
7.6 KiB
Python
246 lines
7.6 KiB
Python
"""
|
||
Day 6 validator test script.

Covers:
1. Reverse validation of t-tests
2. SE triangle validation
3. SD > Mean check
4. CI vs. P-value logic check
|
||
"""
|
||
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# Put this script's own directory at the front of sys.path so the project imports below resolve.
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
||
from forensics.types import ForensicsConfig, TableData, Severity
|
||
from forensics.validator import StatValidator, SCIPY_AVAILABLE
|
||
|
||
# Print the run banner and report whether scipy was importable by the validator.
print(
    "=" * 60,
    "Day 6 验证器测试",
    "=" * 60,
    f"scipy 可用: {SCIPY_AVAILABLE}",
    "",
    sep="\n",
)
|
||
|
||
|
||
def create_mock_table(table_id: str, data: list[list[str]], caption: str = "") -> TableData:
    """Build a TableData fixture from an in-memory grid of cell strings.

    Args:
        table_id: Identifier stored in the table's ``id`` field.
        data: Rows of cell text; the first row determines the column count.
        caption: Optional caption text (defaults to an empty string).

    Returns:
        A TableData whose row/column counts are derived from ``data``, with a
        placeholder HTML body, an empty issue list, and ``skipped`` set to False.
    """
    # An empty grid has no first row to measure, so its width is zero.
    if data:
        width = len(data[0])
    else:
        width = 0

    return TableData(
        id=table_id,
        caption=caption,
        row_count=len(data),
        col_count=width,
        html="<table></table>",
        data=data,
        issues=[],
        skipped=False,
    )
|
||
|
||
|
||
def test_ci_pvalue_consistency():
    """Exercise the CI-vs-P-value consistency check on two one-row tables.

    The first table pairs a confidence interval that spans 1 with a
    significant P value (a contradiction); the second pairs an interval that
    excludes 1 with a significant P value (consistent). Progress is printed
    to stdout.

    Returns:
        True when the contradictory table yields at least one issue and the
        consistent table yields none; False otherwise.
    """
    rule = "=" * 40
    print(rule, "测试 1: CI vs P 值逻辑一致性", rule, sep="\n")

    checker = StatValidator(ForensicsConfig(check_level="L1_L2"))

    # Contradictory case: the CI spans 1 even though P < 0.05.
    conflict_table = create_mock_table(
        "test_ci_1",
        [
            ["Variable", "OR", "95% CI", "P value"],
            ["Age", "1.2", "(0.8-1.5)", "P=0.03"],
        ],
        "CI 矛盾测试 1",
    )
    conflict_issues = checker._validate_ci_pvalue_consistency(conflict_table)

    print(" 测试数据: CI=0.8-1.5 (跨越1), P=0.03 (显著)")
    print(" 期望: 发现 ERROR")
    print(f" 结果: {len(conflict_issues)} 个问题")
    if conflict_issues:
        first = conflict_issues[0]
        print(f" - {first.severity.value}: {first.message}")
    print()

    # Consistent case: the CI excludes 1 and P < 0.05.
    consistent_table = create_mock_table(
        "test_ci_2",
        [
            ["Variable", "OR", "95% CI", "P value"],
            ["Smoking", "2.5", "(1.2-4.8)", "P=0.01"],
        ],
        "CI 正确测试",
    )
    consistent_issues = checker._validate_ci_pvalue_consistency(consistent_table)

    print(" 测试数据: CI=1.2-4.8 (不跨越1), P=0.01 (显著)")
    print(" 期望: 无问题")
    print(f" 结果: {len(consistent_issues)} 个问题")
    print()

    # Pass only if the contradiction was flagged and the clean row was not.
    if len(conflict_issues) == 0:
        return False
    return len(consistent_issues) == 0
|
||
|
||
|
||
def test_se_triangle():
    """Exercise the SE triangle check (agreement between OR, CI and P value).

    Runs the validator on a row whose reported P value roughly matches the
    one implied by its OR and CI, then on a row whose reported P value
    (0.5) clearly does not. Progress is printed to stdout.

    Returns:
        True when scipy is unavailable (the test is skipped), otherwise True
        only if the contradictory row produces at least one issue. Issues
        found for the consistent row are printed but do not affect the result.
    """
    rule = "=" * 40
    print(rule, "测试 2: SE 三角验证 (OR/CI/P 一致性)", rule, sep="\n")

    if not SCIPY_AVAILABLE:
        print(" 跳过: scipy 不可用")
        return True

    checker = StatValidator(ForensicsConfig(check_level="L1_L2"))

    # Consistent case: OR=2.5, CI=1.5-4.2, P=0.001.
    # Worked through the SE triangle formula:
    #   SE = (ln(4.2) - ln(1.5)) / 3.92 = (1.435 - 0.405) / 3.92 = 0.263
    #   Z  = ln(2.5) / 0.263 = 0.916 / 0.263 = 3.48
    #   P  = 2 * (1 - norm.cdf(3.48)) ≈ 0.0005
    consistent_table = create_mock_table(
        "test_se_1",
        [
            ["Variable", "OR (95% CI)", "P value"],
            ["Diabetes", "OR=2.5 (1.5-4.2)", "P=0.001"],
        ],
        "SE 三角一致性测试",
    )
    consistent_issues = checker._validate_se_triangle(consistent_table)

    print(" 测试数据: OR=2.5, CI=1.5-4.2, P=0.001")
    print(f" 结果: {len(consistent_issues)} 个问题")
    for found in consistent_issues:
        print(f" - {found.severity.value}: {found.message}")
    print()

    # Contradictory case: same OR and CI, but a reported P value of 0.5.
    conflict_table = create_mock_table(
        "test_se_2",
        [
            ["Variable", "OR (95% CI)", "P value"],
            ["Diabetes", "OR=2.5 (1.5-4.2)", "P=0.5"],
        ],
        "SE 三角矛盾测试",
    )
    conflict_issues = checker._validate_se_triangle(conflict_table)

    print(" 测试数据: OR=2.5, CI=1.5-4.2, P=0.5 (矛盾)")
    print(" 期望: 发现 ERROR")
    print(f" 结果: {len(conflict_issues)} 个问题")
    for found in conflict_issues:
        print(f" - {found.severity.value}: {found.message}")
        # Show the computed-vs-reported P values when the issue carries evidence.
        if not found.evidence:
            continue
        print(f" 证据: P_calculated={found.evidence.get('p_calculated')}, P_reported={found.evidence.get('p_reported')}")
    print()

    return len(conflict_issues) != 0
|
||
|
||
|
||
def test_sd_greater_mean():
    """Exercise the SD > Mean heuristic on an abnormal and a normal table.

    The abnormal table reports an age of 25.0 ± 30.0 (SD larger than the
    mean); the normal table reports 45.0 ± 12.0. Progress is printed to
    stdout.

    Returns:
        True when the abnormal table yields at least one issue and the normal
        table yields none; False otherwise.
    """
    rule = "=" * 40
    print(rule, "测试 3: SD > Mean 启发式检查", rule, sep="\n")

    checker = StatValidator(ForensicsConfig(check_level="L1_L2"))

    # Abnormal case: in the first group the SD exceeds the mean age.
    abnormal_table = create_mock_table(
        "test_sd_1",
        [
            ["Variable", "Group A", "Group B"],
            ["Age (years)", "25.0 ± 30.0", "28.0 ± 8.5"],
        ],
        "SD > Mean 异常测试",
    )
    abnormal_issues = checker._validate_sd_greater_mean(abnormal_table)

    print(" 测试数据: 年龄 = 25.0 ± 30.0 (SD > Mean)")
    print(" 期望: 发现 ERROR (年龄是正值指标)")
    print(f" 结果: {len(abnormal_issues)} 个问题")
    for found in abnormal_issues:
        print(f" - {found.severity.value}: {found.message}")
    print()

    # Normal case: both SDs are well below their means.
    normal_table = create_mock_table(
        "test_sd_2",
        [
            ["Variable", "Group A", "Group B"],
            ["Age (years)", "45.0 ± 12.0", "48.0 ± 10.5"],
        ],
        "SD 正常测试",
    )
    normal_issues = checker._validate_sd_greater_mean(normal_table)

    print(" 测试数据: 年龄 = 45.0 ± 12.0 (正常)")
    print(" 期望: 无问题")
    print(f" 结果: {len(normal_issues)} 个问题")
    print()

    # Pass only if the abnormal row was flagged and the normal row was not.
    if len(abnormal_issues) == 0:
        return False
    return len(normal_issues) == 0
|
||
|
||
|
||
def test_ttest_validation():
    """Smoke-test the reverse t-test validation on a two-group table.

    Feeds the validator a table whose header carries the sample sizes and
    whose single row reports mean ± SD for each group plus a P value, then
    prints any issues returned.

    Returns:
        Always True: either scipy is unavailable and the test is skipped, or
        the validator ran to completion. The issues found are reported on
        stdout only and do not influence the result.
    """
    rule = "=" * 40
    print(rule, "测试 4: T 检验逆向验证", rule, sep="\n")

    if not SCIPY_AVAILABLE:
        print(" 跳过: scipy 不可用")
        return True

    checker = StatValidator(ForensicsConfig(check_level="L1_L2"))

    # The header row embeds the sample sizes.
    # Reference t-test: M1=45, SD1=10, n1=50; M2=50, SD2=12, n2=48
    #   t = (50-45) / sqrt(10²/50 + 12²/48) = 5 / sqrt(2 + 3) = 5/2.24 = 2.23
    #   P ≈ 0.028, so the reported P=0.03 is close to correct.
    sample_table = create_mock_table(
        "test_t_1",
        [
            ["Variable", "Group A (n=50)", "Group B (n=48)", "P value"],
            ["Score", "45.0 ± 10.0", "50.0 ± 12.0", "P=0.03"],
        ],
        "T 检验测试",
    )
    found_issues = checker._validate_ttest(sample_table)

    print(" 测试数据: Group A: 45.0±10.0 (n=50), Group B: 50.0±12.0 (n=48), P=0.03")
    print(f" 结果: {len(found_issues)} 个问题")
    for found in found_issues:
        print(f" - {found.severity.value}: {found.message}")
    print()

    return True
|
||
|
||
|
||
def run_all_tests():
    """Run the four validator tests in order and print a pass/fail summary.

    Returns:
        True when every test returned a truthy result, False otherwise.
    """
    # Each entry pairs the label shown in the summary with the test to run.
    suite = [
        ("CI vs P 值一致性", test_ci_pvalue_consistency),
        ("SE 三角验证", test_se_triangle),
        ("SD > Mean 检查", test_sd_greater_mean),
        ("T 检验逆向验证", test_ttest_validation),
    ]
    # Run every test before printing the summary so their output comes first.
    outcomes = [(label, check()) for label, check in suite]

    rule = "=" * 60
    print(rule, "测试结果汇总", rule, sep="\n")

    for label, ok in outcomes:
        if ok:
            verdict = "✅ PASS"
        else:
            verdict = "❌ FAIL"
        print(f" {label}: {verdict}")

    everything_ok = all(ok for _, ok in outcomes)

    print()
    if everything_ok:
        print("🎉 所有测试通过!Day 6 验证器实现完成。")
    else:
        print("⚠️ 部分测试失败,请检查代码。")

    return everything_ok
|
||
|
||
|
||
if __name__ == "__main__":
    # Exit with status 0 when every test passed and 1 otherwise.
    if run_all_tests():
        sys.exit(0)
    else:
        sys.exit(1)
|