feat(rvw): Complete V2.0 Week 3 - Statistical validation extension and UX improvements

Week 3 Development Summary:
- Implement negative sign normalization (6 Unicode variants)
- Enhance T-test validation with smart sample size extraction
- Enhance SE triangle and CI-P consistency validation with subrow support
- Add precise sub-cell highlighting for P-values in multi-line cells
- Add frontend issue type Chinese translations (6 new types)
- Add file format tips for PDF/DOC uploads

Technical improvements:
- Add _clean_statistical_text() in extractor.py
- Add _safe_float() wrapper in validator.py
- Add ForensicsReport.tsx component
- Update ISSUE_TYPE_LABELS translations

Documentation:
- Add 2026-02-18 development record
- Update RVW module status (v5.1)
- Update system status (v5.2)

Status: Week 3 complete, ready for Week 4 testing

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-18 18:26:16 +08:00
parent 9f256c4a02
commit f9ed0c2528
36 changed files with 2790 additions and 501 deletions
--- a/extraction_service/forensics/api.py
+++ b/extraction_service/forensics/api.py
@@ -173,7 +173,7 @@ async def analyze_docx(
            f"耗时: {execution_time_ms}ms"
        )
        
-        return JSONResponse(content=result.model_dump())
+        return JSONResponse(content=result.model_dump(by_alias=True))
    
    except HTTPException:
        raise
--- a/extraction_service/forensics/config.py
+++ b/extraction_service/forensics/config.py
@@ -44,6 +44,12 @@ EFFECT_SIZE_PATTERN = re.compile(
    re.IGNORECASE
 )

+# 卡方值匹配，如 "χ²=57.519" 或 "2=57.519" 或 "χ2=57.519"
+CHI_SQUARE_PATTERN = re.compile(
+    r"(?:χ[²2]|[χx]2|2)\s*[=:]\s*(\d+\.?\d*)",
+    re.IGNORECASE
+)
+

 # ==================== 统计方法检测 ====================

--- a/extraction_service/forensics/extractor.py
+++ b/extraction_service/forensics/extractor.py
@@ -225,8 +225,8 @@ class DocxTableExtractor:
                if col_idx >= num_cols:
                    break
                
-                # 获取单元格文本
-                cell_text = self._get_cell_text(cell)
+                # 获取单元格文本（保留换行符用于 HTML 显示）
+                cell_text = self._get_cell_text(cell, use_newline=True)
                
                # 检测合并范围
                # python-docx 中合并单元格会重复出现同一个 cell 对象
@@ -253,13 +253,123 @@ class DocxTableExtractor:
        
        return data
    
-    def _get_cell_text(self, cell: _Cell) -> str:
+    # Symbol 字体字符映射表（Word 使用 Symbol 字体表示希腊字母等）
+    SYMBOL_CHAR_MAP = {
+        'F063': 'χ',   # chi
+        'F032': '²',   # superscript 2
+        'F061': 'α',   # alpha
+        'F062': 'β',   # beta
+        'F067': 'γ',   # gamma
+        'F064': 'δ',   # delta
+        'F065': 'ε',   # epsilon
+        'F06D': 'μ',   # mu
+        'F073': 'σ',   # sigma
+        'F070': 'π',   # pi
+        'F0B2': '²',   # another superscript 2 encoding
+    }
+    
+    def _clean_statistical_text(self, text: str) -> str:
+        """
+        清洗统计学文本中的特殊字符
+        
+        关键清洗：
+        1. 负号归一化（最重要！防止 float() 崩溃）
+        2. 比较符归一化
+        3. 零宽字符清理
+        """
+        if not text:
+            return ""
+        
+        # 1. 负号归一化（极高危！）
+        # Word 会自动把连字符转成破折号或数学减号，导致 float() 报错
+        text = text.replace('\u2212', '-')  # 数学减号 (Minus Sign)
+        text = text.replace('\u2013', '-')  # En Dash
+        text = text.replace('\u2014', '-')  # Em Dash
+        text = text.replace('\u2010', '-')  # Hyphen
+        text = text.replace('\u2011', '-')  # Non-Breaking Hyphen
+        text = text.replace('\u00ad', '-')  # Soft Hyphen
+        
+        # 2. 比较符归一化
+        text = text.replace('\u2264', '<=')  # ≤
+        text = text.replace('\u2265', '>=')  # ≥
+        text = text.replace('\u2260', '!=')  # ≠
+        text = text.replace('\u2248', '~=')  # ≈
+        
+        # 3. 加减号归一化
+        # 保留 ± 原样，因为它在统计学中有特定含义（如 mean±SD）
+        # text = text.replace('\u00b1', '+/-')  # ±
+        
+        # 4. 乘号归一化
+        text = text.replace('\u00d7', 'x')   # ×
+        text = text.replace('\u2217', '*')   # ∗ (asterisk operator)
+        
+        # 5. 零宽字符清理
+        text = text.replace('\u200b', '')    # Zero-Width Space
+        text = text.replace('\u200c', '')    # Zero-Width Non-Joiner
+        text = text.replace('\u200d', '')    # Zero-Width Joiner
+        text = text.replace('\ufeff', '')    # BOM / Zero-Width No-Break Space
+        text = text.replace('\u00a0', ' ')   # Non-Breaking Space -> 普通空格
+        
+        return text
+    
+    def _get_cell_text(self, cell: _Cell, use_newline: bool = False) -> str:
        """
        获取单元格文本（合并多个段落）
+        
+        Args:
+            cell: Word 单元格对象
+            use_newline: 是否使用换行符连接段落（用于 HTML 显示）
+        
+        注意：会处理 Word 的 <w:sym> 符号字符（如 χ² 等）
        """
        paragraphs = cell.paragraphs
-        texts = [p.text.strip() for p in paragraphs]
-        return " ".join(texts).strip()
+        texts = []
+        
+        for para in paragraphs:
+            # 使用增强的文本提取（处理符号字符）
+            para_text = self._extract_paragraph_text(para)
+            if para_text.strip():
+                texts.append(para_text.strip())
+        
+        separator = "\n" if use_newline else " "
+        raw_text = separator.join(texts).strip()
+        
+        # 清洗统计学特殊字符（负号归一化等）
+        return self._clean_statistical_text(raw_text)
+    
+    def _extract_paragraph_text(self, para: Paragraph) -> str:
+        """
+        从段落中提取完整文本，包括 <w:sym> 符号字符
+        
+        Word 使用 <w:sym w:font="Symbol" w:char="F063"/> 表示 χ 等符号，
+        python-docx 的 paragraph.text 不会提取这些内容。
+        """
+        from docx.oxml.ns import qn
+        
+        text_parts = []
+        
+        # 遍历段落中的所有 run 元素
+        for run in para._p.iter():
+            # 处理普通文本
+            if run.tag == qn('w:t'):
+                text_parts.append(run.text or '')
+            
+            # 处理符号字符 <w:sym>
+            elif run.tag == qn('w:sym'):
+                font = run.get(qn('w:font'))
+                char_code = run.get(qn('w:char'))
+                
+                if font == 'Symbol' and char_code:
+                    # 查找映射
+                    unicode_char = self.SYMBOL_CHAR_MAP.get(char_code.upper(), '')
+                    if unicode_char:
+                        text_parts.append(unicode_char)
+                    else:
+                        # 未知符号，记录警告
+                        logger.debug(f"Unknown Symbol char: {char_code}")
+                        text_parts.append(f'[SYM:{char_code}]')
+        
+        return ''.join(text_parts)
    
    def _generate_html(
        self,
@@ -296,8 +406,10 @@ class DocxTableExtractor:
            html_parts.append("    <tr>")
            for col_idx, cell in enumerate(row, start=1):
                coord = f"R{row_idx}C{col_idx}"
+                # 为每个子行添加 span 标记，支持细粒度高亮
+                cell_html = self._escape_html_with_subrows(cell, coord)
                html_parts.append(
-                    f'      <td data-coord="{coord}">{self._escape_html(cell)}</td>'
+                    f'      <td data-coord="{coord}">{cell_html}</td>'
                )
            html_parts.append("    </tr>")
        html_parts.append("  </tbody>")
@@ -307,7 +419,43 @@ class DocxTableExtractor:
        return "\n".join(html_parts)
    
    def _escape_html(self, text: str) -> str:
-        """转义 HTML 特殊字符"""
+        """转义 HTML 特殊字符，并将换行符转换为 <br>"""
+        escaped = (
+            text
+            .replace("&", "&amp;")
+            .replace("<", "&lt;")
+            .replace(">", "&gt;")
+            .replace('"', "&quot;")
+            .replace("'", "&#39;")
+        )
+        # 将换行符转换为 <br> 标签，保留表格中的多行结构
+        return escaped.replace("\n", "<br>")
+    
+    def _escape_html_with_subrows(self, text: str, coord: str) -> str:
+        """
+        转义 HTML 并为每个子行添加 span 标记，支持细粒度高亮
+        
+        例如：单元格内容 "0.017\n0.01\n<0.001" 会生成：
+        <span data-subcoord="R5C5S1">0.017</span><br>
+        <span data-subcoord="R5C5S2">0.01</span><br>
+        <span data-subcoord="R5C5S3">&lt;0.001</span>
+        """
+        lines = text.split("\n")
+        if len(lines) == 1:
+            # 单行内容，直接转义
+            return self._escape_single(text)
+        
+        # 多行内容，为每行添加 span
+        result_parts = []
+        for idx, line in enumerate(lines, start=1):
+            escaped_line = self._escape_single(line)
+            subcoord = f"{coord}S{idx}"
+            result_parts.append(f'<span data-subcoord="{subcoord}">{escaped_line}</span>')
+        
+        return "<br>".join(result_parts)
+    
+    def _escape_single(self, text: str) -> str:
+        """转义单行文本的 HTML 特殊字符"""
        return (
            text
            .replace("&", "&amp;")
--- a/extraction_service/forensics/types.py
+++ b/extraction_service/forensics/types.py
@@ -4,7 +4,7 @@
 定义所有数据结构，确保类型安全和接口一致性。
 """

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, model_serializer
 from typing import List, Dict, Any, Optional
 from enum import Enum

@@ -59,15 +59,30 @@ class ForensicsConfig(BaseModel):


 class CellLocation(BaseModel):
-    """单元格位置（R1C1 坐标）"""
-    table_id: str = Field(..., description="表格 ID，如 tbl_0")
+    """单元格位置（R1C1 坐标），支持单元格内子行定位"""
+    table_id: str = Field(..., alias="tableId", description="表格 ID，如 tbl_0")
    row: int = Field(..., description="行号，从 1 开始")
    col: int = Field(..., description="列号，从 1 开始")
+    subrow: Optional[int] = Field(None, description="单元格内子行号，从 1 开始（用于多行单元格）")
    
-    @property
-    def cell_ref(self) -> str:
-        """返回 R1C1 格式的坐标"""
-        return f"R{self.row}C{self.col}"
+    model_config = {"populate_by_name": True}
+    
+    @model_serializer
+    def serialize(self) -> Dict[str, Any]:
+        """序列化时自动添加 cellRef 字段，支持子行坐标"""
+        # 基础坐标：R{row}C{col}
+        # 子行坐标：R{row}C{col}S{subrow}
+        cell_ref = f"R{self.row}C{self.col}"
+        if self.subrow is not None:
+            cell_ref += f"S{self.subrow}"
+        
+        return {
+            "tableId": self.table_id,
+            "row": self.row,
+            "col": self.col,
+            "subrow": self.subrow,
+            "cellRef": cell_ref
+        }


 class Issue(BaseModel):
@@ -84,26 +99,30 @@ class TableData(BaseModel):
    id: str = Field(..., description="表格 ID，如 tbl_0")
    caption: Optional[str] = Field(None, description="表格标题")
    type: Optional[str] = Field(None, description="表格类型：BASELINE/OUTCOME/OTHER")
-    row_count: int = Field(..., description="行数")
-    col_count: int = Field(..., description="列数")
+    row_count: int = Field(..., alias="rowCount", description="行数")
+    col_count: int = Field(..., alias="colCount", description="列数")
    html: str = Field(..., description="预渲染的 HTML 片段")
    data: List[List[str]] = Field(..., description="二维数组数据")
    issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表")
    skipped: bool = Field(default=False, description="是否被跳过（超限）")
-    skip_reason: Optional[str] = Field(None, description="跳过原因")
+    skip_reason: Optional[str] = Field(None, alias="skipReason", description="跳过原因")
+    
+    model_config = {"populate_by_name": True}


 class ForensicsResult(BaseModel):
    """数据侦探分析结果"""
    success: bool = Field(..., description="是否成功")
-    methods_found: List[str] = Field(default_factory=list, description="检测到的统计方法")
+    methods_found: List[str] = Field(default_factory=list, alias="methodsFound", description="检测到的统计方法")
    tables: List[TableData] = Field(default_factory=list, description="表格列表")
-    total_issues: int = Field(default=0, description="总问题数")
-    error_count: int = Field(default=0, description="ERROR 级别问题数")
-    warning_count: int = Field(default=0, description="WARNING 级别问题数")
-    execution_time_ms: int = Field(default=0, description="执行时间（毫秒）")
+    total_issues: int = Field(default=0, alias="totalIssues", description="总问题数")
+    error_count: int = Field(default=0, alias="errorCount", description="ERROR 级别问题数")
+    warning_count: int = Field(default=0, alias="warningCount", description="WARNING 级别问题数")
+    execution_time_ms: int = Field(default=0, alias="executionTimeMs", description="执行时间（毫秒）")
    error: Optional[str] = Field(None, description="错误信息（如果失败）")
-    fallback_available: bool = Field(default=True, description="是否可降级执行")
+    fallback_available: bool = Field(default=True, alias="fallbackAvailable", description="是否可降级执行")
+    
+    model_config = {"populate_by_name": True}


 class ExtractionError(Exception):
--- a/extraction_service/forensics/validator.py
+++ b/extraction_service/forensics/validator.py
--- a/extraction_service/main.py
+++ b/extraction_service/main.py
@@ -52,9 +52,6 @@ app.add_middleware(
 TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
 TEMP_DIR.mkdir(parents=True, exist_ok=True)

-# 注册 RVW V2.0 数据侦探路由
-app.include_router(forensics_router)
-
 # 导入服务模块
 from services.pdf_extractor import extract_pdf_pymupdf
 from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
@@ -71,6 +68,7 @@ from services.doc_export_service import check_pandoc_available, convert_markdown

 # 新增：RVW V2.0 数据侦探模块
 from forensics.api import router as forensics_router
+app.include_router(forensics_router)

 # 兼容：nougat 相关（已废弃，保留空实现避免报错）
 def check_nougat_available(): return False