feat(rvw): Complete V2.0 Week 3 - Statistical validation extension and UX improvements

Week 3 Development Summary:

- Implement negative sign normalization (6 Unicode variants)

- Enhance T-test validation with smart sample size extraction

- Enhance SE triangle and CI-P consistency validation with subrow support

- Add precise sub-cell highlighting for P-values in multi-line cells

- Add frontend issue type Chinese translations (6 new types)

- Add file format tips for PDF/DOC uploads

Technical improvements:

- Add _clean_statistical_text() in extractor.py

- Add _safe_float() wrapper in validator.py

- Add ForensicsReport.tsx component

- Update ISSUE_TYPE_LABELS translations

Documentation:

- Add 2026-02-18 development record

- Update RVW module status (v5.1)

- Update system status (v5.2)

Status: Week 3 complete, ready for Week 4 testing
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-18 18:26:16 +08:00
parent 9f256c4a02
commit f9ed0c2528
36 changed files with 2790 additions and 501 deletions

View File

@@ -173,7 +173,7 @@ async def analyze_docx(
f"耗时: {execution_time_ms}ms"
)
return JSONResponse(content=result.model_dump())
return JSONResponse(content=result.model_dump(by_alias=True))
except HTTPException:
raise

View File

@@ -44,6 +44,12 @@ EFFECT_SIZE_PATTERN = re.compile(
re.IGNORECASE
)
# Chi-square statistic, e.g. "χ²=57.519", "χ2=57.519", "x2=57.519", or a bare
# "2=57.519" where no χ glyph is present in the text.
# Group 1 captures the numeric value that follows "=" or ":".
CHI_SQUARE_PATTERN = re.compile(r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)", flags=re.IGNORECASE)
# ==================== 统计方法检测 ====================

View File

@@ -225,8 +225,8 @@ class DocxTableExtractor:
if col_idx >= num_cols:
break
# 获取单元格文本
cell_text = self._get_cell_text(cell)
# 获取单元格文本(保留换行符用于 HTML 显示)
cell_text = self._get_cell_text(cell, use_newline=True)
# 检测合并范围
# python-docx 中合并单元格会重复出现同一个 cell 对象
@@ -253,13 +253,123 @@ class DocxTableExtractor:
return data
# Symbol-font character map. Word can store Greek letters and similar glyphs
# as <w:sym w:font="Symbol" w:char="F0xx"/> elements; this table maps the hex
# ``w:char`` code to the Unicode character to emit. Keys are upper-case
# because lookups upper-case the code before consulting the table.
SYMBOL_CHAR_MAP = {
    'F063': 'χ',  # chi
    'F032': '²',  # superscript 2
    'F061': 'α',  # alpha
    'F062': 'β',  # beta
    'F067': 'γ',  # gamma
    'F064': 'δ',  # delta
    'F065': 'ε',  # epsilon
    'F06D': 'μ',  # mu
    'F073': 'σ',  # sigma
    'F070': 'π',  # pi
    'F0B2': '²',  # another superscript 2 encoding
}
def _clean_statistical_text(self, text: str) -> str:
"""
清洗统计学文本中的特殊字符
关键清洗:
1. 负号归一化(最重要!防止 float() 崩溃)
2. 比较符归一化
3. 零宽字符清理
"""
if not text:
return ""
# 1. 负号归一化(极高危!)
# Word 会自动把连字符转成破折号或数学减号,导致 float() 报错
text = text.replace('\u2212', '-') # 数学减号 (Minus Sign)
text = text.replace('\u2013', '-') # En Dash
text = text.replace('\u2014', '-') # Em Dash
text = text.replace('\u2010', '-') # Hyphen
text = text.replace('\u2011', '-') # Non-Breaking Hyphen
text = text.replace('\u00ad', '-') # Soft Hyphen
# 2. 比较符归一化
text = text.replace('\u2264', '<=') # ≤
text = text.replace('\u2265', '>=') # ≥
text = text.replace('\u2260', '!=') # ≠
text = text.replace('\u2248', '~=') # ≈
# 3. 加减号归一化
# 保留 ± 原样,因为它在统计学中有特定含义(如 mean±SD
# text = text.replace('\u00b1', '+/-') # ±
# 4. 乘号归一化
text = text.replace('\u00d7', 'x') # ×
text = text.replace('\u2217', '*') # (asterisk operator)
# 5. 零宽字符清理
text = text.replace('\u200b', '') # Zero-Width Space
text = text.replace('\u200c', '') # Zero-Width Non-Joiner
text = text.replace('\u200d', '') # Zero-Width Joiner
text = text.replace('\ufeff', '') # BOM / Zero-Width No-Break Space
text = text.replace('\u00a0', ' ') # Non-Breaking Space -> 普通空格
return text
def _get_cell_text(self, cell: _Cell, use_newline: bool = False) -> str:
    """
    Return the text of a table cell, merging all of its paragraphs.

    Each paragraph is read through ``_extract_paragraph_text`` so that Word
    ``<w:sym>`` symbol characters (such as χ²) are included. Paragraphs that
    are empty after stripping are skipped, and the joined text is passed
    through ``_clean_statistical_text`` (minus-sign normalisation etc.).

    Args:
        cell: Word table cell object.
        use_newline: Join paragraphs with "\\n" instead of a single space
            (used when the text is rendered to HTML).

    Returns:
        The merged and cleaned cell text.
    """
    separator = "\n" if use_newline else " "

    pieces = []
    for para in cell.paragraphs:
        # Symbol-aware extraction; plain paragraph text would miss <w:sym>.
        para_text = self._extract_paragraph_text(para).strip()
        if para_text:
            pieces.append(para_text)

    raw_text = separator.join(pieces).strip()
    # Clean statistical special characters (minus-sign normalisation etc.).
    return self._clean_statistical_text(raw_text)
def _extract_paragraph_text(self, para: Paragraph) -> str:
    """
    Extract the full text of a paragraph, including ``<w:sym>`` symbols.

    Word writes characters such as χ as
    ``<w:sym w:font="Symbol" w:char="F063"/>``, which python-docx's
    ``paragraph.text`` does not return. This method walks every element
    below the paragraph and collects:

    - ``w:t``: literal text.
    - ``w:sym`` with ``w:font="Symbol"``: mapped through
      ``SYMBOL_CHAR_MAP``; unknown codes are kept as ``[SYM:<code>]``.
    - ``w:br`` (text-wrapping only) and ``w:cr``: "\\n".
    - ``w:tab`` directly inside a run: "\\t".
    - ``w:noBreakHyphen``: "-", so a minus sign typed as a non-breaking
      hyphen is not lost.

    The last three are what ``paragraph.text`` produces for in-run breaks,
    tabs and non-breaking hyphens; collecting only ``w:t`` would silently
    merge multi-line values and drop minus signs.

    Args:
        para: python-docx paragraph to read.

    Returns:
        The concatenated paragraph text.
    """
    from docx.oxml.ns import qn

    text_tag = qn('w:t')
    sym_tag = qn('w:sym')
    run_tag = qn('w:r')
    tab_tag = qn('w:tab')
    break_tag = qn('w:br')
    cr_tag = qn('w:cr')
    no_break_hyphen_tag = qn('w:noBreakHyphen')

    text_parts = []
    # iter() yields every descendant element, not only runs.
    for node in para._p.iter():
        tag = node.tag
        if tag == text_tag:
            text_parts.append(node.text or '')
        elif tag == sym_tag:
            font = node.get(qn('w:font'))
            char_code = node.get(qn('w:char'))
            if font == 'Symbol' and char_code:
                unicode_char = self.SYMBOL_CHAR_MAP.get(char_code.upper(), '')
                if unicode_char:
                    text_parts.append(unicode_char)
                else:
                    # Unknown symbol: log at debug level, keep a placeholder.
                    logger.debug("Unknown Symbol char: %s", char_code)
                    text_parts.append(f'[SYM:{char_code}]')
        elif tag == cr_tag:
            text_parts.append('\n')
        elif tag == break_tag:
            # Page and column breaks carry w:type; only line breaks are text.
            if node.get(qn('w:type')) in (None, 'textWrapping'):
                text_parts.append('\n')
        elif tag == tab_tag:
            # w:tab also names tab-stop definitions under w:pPr/w:tabs;
            # only a w:tab that is a direct child of a run is a tab character.
            parent = node.getparent()
            if parent is not None and parent.tag == run_tag:
                text_parts.append('\t')
        elif tag == no_break_hyphen_tag:
            text_parts.append('-')

    return ''.join(text_parts)
def _generate_html(
self,
@@ -296,8 +406,10 @@ class DocxTableExtractor:
html_parts.append(" <tr>")
for col_idx, cell in enumerate(row, start=1):
coord = f"R{row_idx}C{col_idx}"
# 为每个子行添加 span 标记,支持细粒度高亮
cell_html = self._escape_html_with_subrows(cell, coord)
html_parts.append(
f' <td data-coord="{coord}">{self._escape_html(cell)}</td>'
f' <td data-coord="{coord}">{cell_html}</td>'
)
html_parts.append(" </tr>")
html_parts.append(" </tbody>")
@@ -307,7 +419,43 @@ class DocxTableExtractor:
return "\n".join(html_parts)
def _escape_html(self, text: str) -> str:
"""转义 HTML 特殊字符"""
"""转义 HTML 特殊字符,并将换行符转换为 <br>"""
escaped = (
text
.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&#39;")
)
# 将换行符转换为 <br> 标签,保留表格中的多行结构
return escaped.replace("\n", "<br>")
def _escape_html_with_subrows(self, text: str, coord: str) -> str:
    """
    Escape cell text for HTML and wrap each sub-row in its own ``<span>``.

    Single-line text is only escaped. Multi-line text (split on "\\n") gets
    one span per line, numbered from 1, joined with ``<br>``, so individual
    lines inside a cell can be highlighted. For example the cell content
    "0.017\\n0.01\\n<0.001" at coordinate R5C5 becomes:

        <span data-subcoord="R5C5S1">0.017</span><br>
        <span data-subcoord="R5C5S2">0.01</span><br>
        <span data-subcoord="R5C5S3">&lt;0.001</span>

    Args:
        text: Raw cell text, possibly containing newlines.
        coord: Cell coordinate (e.g. "R5C5") used as the sub-coordinate prefix.

    Returns:
        The HTML fragment for the cell content.
    """
    sub_lines = text.split("\n")

    # Single line: no span wrapper, just escape.
    if len(sub_lines) == 1:
        return self._escape_single(text)

    # Multiple lines: one span per line, tagged with "<coord>S<n>".
    return "<br>".join(
        f'<span data-subcoord="{coord}S{number}">{self._escape_single(sub_line)}</span>'
        for number, sub_line in enumerate(sub_lines, start=1)
    )
def _escape_single(self, text: str) -> str:
"""转义单行文本的 HTML 特殊字符"""
return (
text
.replace("&", "&amp;")

View File

@@ -4,7 +4,7 @@
定义所有数据结构,确保类型安全和接口一致性。
"""
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, model_serializer
from typing import List, Dict, Any, Optional
from enum import Enum
@@ -59,15 +59,30 @@ class ForensicsConfig(BaseModel):
class CellLocation(BaseModel):
    """
    Cell position in R1C1 coordinates, optionally narrowed to a sub-row.

    ``table_id`` can be populated by field name or by its ``tableId`` alias.
    Serialisation always produces camelCase keys plus a computed ``cellRef``.
    """

    # Accept both the Python field name and the camelCase alias on input.
    model_config = {"populate_by_name": True}

    table_id: str = Field(..., alias="tableId", description="表格 ID如 tbl_0")
    row: int = Field(..., description="行号,从 1 开始")
    col: int = Field(..., description="列号,从 1 开始")
    subrow: Optional[int] = Field(None, description="单元格内子行号,从 1 开始(用于多行单元格)")

    @property
    def cell_ref(self) -> str:
        """Return the cell coordinate in R1C1 form, without the sub-row."""
        return f"R{self.row}C{self.col}"

    @model_serializer
    def serialize(self) -> Dict[str, Any]:
        """
        Serialise to a dict with camelCase keys and a computed ``cellRef``.

        ``cellRef`` is ``R{row}C{col}``, extended to ``R{row}C{col}S{subrow}``
        when a sub-row is set.
        """
        cell_ref = self.cell_ref
        if self.subrow is not None:
            cell_ref = f"{cell_ref}S{self.subrow}"
        return {
            "tableId": self.table_id,
            "row": self.row,
            "col": self.col,
            "subrow": self.subrow,
            "cellRef": cell_ref,
        }
class Issue(BaseModel):
@@ -84,26 +99,30 @@ class TableData(BaseModel):
id: str = Field(..., description="表格 ID如 tbl_0")
caption: Optional[str] = Field(None, description="表格标题")
type: Optional[str] = Field(None, description="表格类型BASELINE/OUTCOME/OTHER")
row_count: int = Field(..., description="行数")
col_count: int = Field(..., description="列数")
row_count: int = Field(..., alias="rowCount", description="行数")
col_count: int = Field(..., alias="colCount", description="列数")
html: str = Field(..., description="预渲染的 HTML 片段")
data: List[List[str]] = Field(..., description="二维数组数据")
issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表")
skipped: bool = Field(default=False, description="是否被跳过(超限)")
skip_reason: Optional[str] = Field(None, description="跳过原因")
skip_reason: Optional[str] = Field(None, alias="skipReason", description="跳过原因")
model_config = {"populate_by_name": True}
class ForensicsResult(BaseModel):
    """
    Result of a data-forensics analysis run.

    Multi-word fields declare camelCase aliases; ``populate_by_name`` keeps
    construction by the snake_case field names working as well.
    """

    # Accept both the Python field names and the camelCase aliases on input.
    model_config = {"populate_by_name": True}

    success: bool = Field(..., description="是否成功")
    methods_found: List[str] = Field(default_factory=list, alias="methodsFound", description="检测到的统计方法")
    tables: List[TableData] = Field(default_factory=list, description="表格列表")
    total_issues: int = Field(default=0, alias="totalIssues", description="总问题数")
    error_count: int = Field(default=0, alias="errorCount", description="ERROR 级别问题数")
    warning_count: int = Field(default=0, alias="warningCount", description="WARNING 级别问题数")
    execution_time_ms: int = Field(default=0, alias="executionTimeMs", description="执行时间(毫秒)")
    error: Optional[str] = Field(None, description="错误信息(如果失败)")
    fallback_available: bool = Field(default=True, alias="fallbackAvailable", description="是否可降级执行")
class ExtractionError(Exception):

File diff suppressed because it is too large Load Diff

View File

@@ -52,9 +52,6 @@ app.add_middleware(
# Working directory for temporary files; overridable through the TEMP_DIR
# environment variable and created at import time if it does not exist.
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
TEMP_DIR.mkdir(parents=True, exist_ok=True)
# NOTE: the forensics router is registered further down, right after
# `forensics_router` is imported; registering it here would reference the
# name before it is defined.
# 导入服务模块
from services.pdf_extractor import extract_pdf_pymupdf
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
@@ -71,6 +68,7 @@ from services.doc_export_service import check_pandoc_available, convert_markdown
# 新增RVW V2.0 数据侦探模块
from forensics.api import router as forensics_router
app.include_router(forensics_router)
# Compatibility shim for the deprecated nougat integration: kept as a stub
# implementation to avoid errors.
def check_nougat_available():
    """Report nougat as unavailable; always returns False."""
    return False