feat(rvw): Implement RVW V2.0 Data Forensics Module - Day 6 StatValidator

Summary:
- Implement L2 Statistical Validator (CI-P consistency, T-test reverse)
- Implement L2.5 Consistency Forensics (SE Triangle, SD>Mean check)
- Add error/warning severity classification with tolerance thresholds
- Support 5+ CI formats parsing (parentheses, brackets, 95% CI prefix)
- Complete Python forensics service (types, config, validator, extractor)

V2.0 Development Progress (Week 2 Day 6):
- Day 1-5: Python service setup, Word table extraction, L1 arithmetic validator
- Day 6: L2 StatValidator + L2.5 consistency forensics (promoted from V2.1)

Test Results:
- Unit tests: 4/4 passed (CI-P, SE Triangle, SD>Mean, T-test)
- Real document tests: 5/5 successful, 2 reasonable WARNINGs

Status: Day 6 completed, ready for Day 7 (Skills Framework)
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-17 22:15:27 +08:00
parent 7a299e8562
commit e785969e54
31 changed files with 5925 additions and 15 deletions

View File

@@ -0,0 +1,340 @@
"""
数据侦探模块 - Word 表格提取器
使用 python-docx 解析 Word 文档,提取表格数据并生成 HTML 片段。
功能:
- 解析 Word DOM 结构
- 处理合并单元格(Forward Fill 策略)
- 关联表格 Caption(向前回溯)
- 生成 HTML 片段(含 data-coord 属性)
"""
from docx import Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
from loguru import logger
from typing import List, Optional, Tuple
import re
from .types import TableData, Issue, Severity, IssueType, CellLocation, ForensicsConfig
from .config import (
MAX_TABLE_ROWS,
MAX_TABLES_PER_DOC,
BASELINE_KEYWORDS,
OUTCOME_KEYWORDS,
)
class DocxTableExtractor:
    """Word table extractor.

    Extracts every table from a .docx file, resolves merged cells with a
    Forward Fill strategy, associates each table with its caption (found by
    scanning backwards through preceding paragraphs), and renders an HTML
    fragment per table with ``data-coord`` attributes for front-end
    highlighting.
    """

    # Caption patterns such as "Table 1. xxx", "Table 1: xxx", "表 1 xxx".
    # Compiled once at class level instead of on every _find_caption call.
    _CAPTION_RE = re.compile(r"^(Table|表)\s*\d+[\.:\s]", re.IGNORECASE)

    def __init__(self, config: ForensicsConfig):
        # Keep the full config plus a shortcut for the row limit used in
        # _extract_table.
        self.config = config
        self.max_table_rows = config.max_table_rows

    def extract(self, file_path: str) -> Tuple[List[TableData], str]:
        """Extract all tables from a Word document.

        Args:
            file_path: Path to a .docx file.

        Returns:
            ``(tables, full_text)``: the extracted tables and the plain text
            of all paragraphs (used downstream for method detection).

        Raises:
            ValueError: If the document cannot be opened.
        """
        logger.info(f"开始提取表格: {file_path}")
        try:
            doc = Document(file_path)
        except Exception as e:
            logger.error(f"无法打开 Word 文档: {e}")
            raise ValueError(f"无法打开 Word 文档: {e}")

        tables: List[TableData] = []

        # Collect all paragraph text up front (used for method detection).
        full_text_parts: List[str] = [para.text for para in doc.paragraphs]

        # Walk the document body in order so each table can be associated
        # with the paragraphs immediately preceding it (caption candidates).
        table_index = 0
        prev_paragraphs: List[str] = []
        for element in doc.element.body:
            if element.tag.endswith('p'):
                # Paragraph element: remember its text as a caption candidate.
                para = Paragraph(element, doc)
                prev_paragraphs.append(para.text.strip())
                # Only the 3 most recent paragraphs are kept for caption
                # matching — captions sit directly above their table.
                if len(prev_paragraphs) > 3:
                    prev_paragraphs.pop(0)
            elif element.tag.endswith('tbl'):
                # Table element.
                if table_index >= MAX_TABLES_PER_DOC:
                    logger.warning(f"表格数量超过限制 ({MAX_TABLES_PER_DOC}),跳过剩余表格")
                    break
                table = Table(element, doc)
                caption = self._find_caption(prev_paragraphs)
                table_data = self._extract_table(
                    table=table,
                    table_id=f"tbl_{table_index}",
                    caption=caption
                )
                tables.append(table_data)
                table_index += 1
                # Reset candidates so a caption is never reused for the
                # next table.
                prev_paragraphs = []

        full_text = "\n".join(full_text_parts)
        logger.info(f"提取完成: {len(tables)} 个表格, {len(full_text)} 字符")
        return tables, full_text

    def _find_caption(self, prev_paragraphs: List[str]) -> Optional[str]:
        """Find a table caption among the preceding paragraphs.

        Scans from the nearest paragraph backwards and returns the first one
        matching a "Table N" / "表 N" pattern, or ``None`` if none matches.
        """
        for para in reversed(prev_paragraphs):
            if para and self._CAPTION_RE.match(para):
                return para
        return None

    def _extract_table(
        self,
        table: Table,
        table_id: str,
        caption: Optional[str]
    ) -> TableData:
        """Extract a single table.

        Args:
            table: python-docx ``Table`` object.
            table_id: Stable identifier, e.g. ``"tbl_0"``.
            caption: Caption text found above the table, if any.

        Returns:
            A populated ``TableData``; if the table exceeds the configured
            row limit, a skipped placeholder with a WARNING issue.
        """
        rows = table.rows
        row_count = len(rows)
        col_count = len(rows[0].cells) if rows else 0

        # Oversized tables are skipped entirely (guards memory/CPU), but a
        # placeholder with a WARNING issue is still returned so the caller
        # can surface the skip to the user.
        if row_count > self.max_table_rows:
            logger.warning(f"表格 {table_id} 行数 ({row_count}) 超过限制 ({self.max_table_rows}),跳过")
            return TableData(
                id=table_id,
                caption=caption,
                type=self._detect_table_type(caption),
                row_count=row_count,
                col_count=col_count,
                html=f"<p class='warning'>表格行数 ({row_count}) 超过限制 ({self.max_table_rows}),已跳过</p>",
                data=[],
                issues=[
                    Issue(
                        severity=Severity.WARNING,
                        type=IssueType.TABLE_SKIPPED,
                        message=f"表格行数 ({row_count}) 超过限制 ({self.max_table_rows})",
                        location=CellLocation(table_id=table_id, row=1, col=1),
                        evidence={"row_count": row_count, "max_rows": self.max_table_rows}
                    )
                ],
                skipped=True,
                skip_reason=f"行数超限: {row_count} > {self.max_table_rows}"
            )

        # Raw grid with merged cells resolved (Forward Fill).
        data = self._extract_with_merge_handling(table)
        html = self._generate_html(table_id, caption, data)
        table_type = self._detect_table_type(caption)

        return TableData(
            id=table_id,
            caption=caption,
            type=table_type,
            row_count=len(data),
            col_count=len(data[0]) if data else 0,
            html=html,
            data=data,
            issues=[],
            skipped=False,
            skip_reason=None
        )

    def _extract_with_merge_handling(self, table: Table) -> List[List[str]]:
        """Extract the table grid, resolving merged cells (Forward Fill).

        python-docx already normalizes merges in ``row.cells``: a merged
        region (horizontal *or* vertical) repeats the same underlying
        ``_tc`` XML element at every grid position it spans. Reading every
        grid position therefore fills all merged spans with the merged
        cell's text — exactly the Forward Fill behavior we want.

        Note: the previous implementation walked ``row.cells`` with a
        manual column cursor plus a ``processed`` matrix; repeated cell
        objects from a merge were not skipped and got written into later
        columns, shifting/overwriting neighboring values, and vertical
        merges were never filled (``merge_height`` was fixed at 1).
        """
        rows = table.rows
        if not rows:
            return []

        # Grid width is fixed by the first row's normalized cell list.
        num_cols = len(rows[0].cells)

        # Cache text per underlying XML element so a cell spanning many
        # grid positions is converted to text only once.
        text_by_tc: dict = {}

        data: List[List[str]] = []
        for row in rows:
            cells = row.cells
            row_values: List[str] = []
            for col in range(num_cols):
                if col < len(cells):
                    key = id(cells[col]._tc)
                    if key not in text_by_tc:
                        text_by_tc[key] = self._get_cell_text(cells[col])
                    row_values.append(text_by_tc[key])
                else:
                    # Irregular grid row: pad missing trailing columns.
                    row_values.append("")
            data.append(row_values)
        return data

    def _get_cell_text(self, cell: _Cell) -> str:
        """Return the cell's text with all its paragraphs joined by spaces."""
        texts = [p.text.strip() for p in cell.paragraphs]
        return " ".join(texts).strip()

    def _generate_html(
        self,
        table_id: str,
        caption: Optional[str],
        data: List[List[str]]
    ) -> str:
        """Render the grid as an HTML fragment.

        Every cell carries a ``data-coord`` attribute ("R{row}C{col}",
        1-based) so the front end can highlight flagged cells. The first
        data row is assumed to be the header.
        """
        if not data:
            return f"<table id='{table_id}' class='forensics-table'><tr><td>空表格</td></tr></table>"

        html_parts = [f"<table id='{table_id}' class='forensics-table'>"]
        if caption:
            html_parts.append(f"  <caption>{self._escape_html(caption)}</caption>")

        # Header row (assumed to be the first row).
        html_parts.append("  <thead>")
        html_parts.append("    <tr>")
        for col_idx, cell in enumerate(data[0], start=1):
            coord = f"R1C{col_idx}"
            html_parts.append(
                f'      <th data-coord="{coord}">{self._escape_html(cell)}</th>'
            )
        html_parts.append("    </tr>")
        html_parts.append("  </thead>")

        # Body rows; coordinates continue from R2 so they match the raw grid.
        html_parts.append("  <tbody>")
        for row_idx, row in enumerate(data[1:], start=2):
            html_parts.append("    <tr>")
            for col_idx, cell in enumerate(row, start=1):
                coord = f"R{row_idx}C{col_idx}"
                html_parts.append(
                    f'      <td data-coord="{coord}">{self._escape_html(cell)}</td>'
                )
            html_parts.append("    </tr>")
        html_parts.append("  </tbody>")
        html_parts.append("</table>")
        return "\n".join(html_parts)

    def _escape_html(self, text: str) -> str:
        """Escape HTML special characters ('&' first so entities survive)."""
        return (
            text
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
            .replace("'", "&#39;")
        )

    def _detect_table_type(self, caption: Optional[str]) -> str:
        """Classify a table by its caption keywords.

        Returns:
            ``"BASELINE"`` / ``"OUTCOME"`` / ``"OTHER"`` (baseline keywords
            take precedence when both match).
        """
        if not caption:
            return "OTHER"
        caption_lower = caption.lower()
        for keyword in BASELINE_KEYWORDS:
            if keyword in caption_lower:
                return "BASELINE"
        for keyword in OUTCOME_KEYWORDS:
            if keyword in caption_lower:
                return "OUTCOME"
        return "OTHER"