feat(rvw): Complete V2.0 Week 3 - Statistical validation extension and UX improvements
Week 3 Development Summary: - Implement negative sign normalization (6 Unicode variants) - Enhance T-test validation with smart sample size extraction - Enhance SE triangle and CI-P consistency validation with subrow support - Add precise sub-cell highlighting for P-values in multi-line cells - Add frontend issue type Chinese translations (6 new types) - Add file format tips for PDF/DOC uploads Technical improvements: - Add _clean_statistical_text() in extractor.py - Add _safe_float() wrapper in validator.py - Add ForensicsReport.tsx component - Update ISSUE_TYPE_LABELS translations Documentation: - Add 2026-02-18 development record - Update RVW module status (v5.1) - Update system status (v5.2) Status: Week 3 complete, ready for Week 4 testing Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -173,7 +173,7 @@ async def analyze_docx(
|
||||
f"耗时: {execution_time_ms}ms"
|
||||
)
|
||||
|
||||
return JSONResponse(content=result.model_dump())
|
||||
return JSONResponse(content=result.model_dump(by_alias=True))
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
|
||||
@@ -44,6 +44,12 @@ EFFECT_SIZE_PATTERN = re.compile(
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
# Chi-square statistic, e.g. "χ²=57.519", "χ2=57.519", "x2=57.519", or a bare
# "2=57.519" (what is left when the χ glyph is missing from the extracted text).
# The ASCII fallbacks ("x2" and the bare "2") are only accepted when they are
# not glued to a preceding ASCII letter, digit or decimal point; without that
# guard "R2=0.85", "I2=45" or "12=5" would be captured as chi-square values.
# Group 1 captures the numeric value.
CHI_SQUARE_PATTERN = re.compile(
    r"(?:χ[²2]|(?<![A-Za-z0-9.])x?2)\s*[=:]\s*(\d+\.?\d*)",
    re.IGNORECASE
)
|
||||
|
||||
|
||||
# ==================== 统计方法检测 ====================
|
||||
|
||||
|
||||
@@ -225,8 +225,8 @@ class DocxTableExtractor:
|
||||
if col_idx >= num_cols:
|
||||
break
|
||||
|
||||
# 获取单元格文本
|
||||
cell_text = self._get_cell_text(cell)
|
||||
# 获取单元格文本(保留换行符用于 HTML 显示)
|
||||
cell_text = self._get_cell_text(cell, use_newline=True)
|
||||
|
||||
# 检测合并范围
|
||||
# python-docx 中合并单元格会重复出现同一个 cell 对象
|
||||
@@ -253,13 +253,123 @@ class DocxTableExtractor:
|
||||
|
||||
return data
|
||||
|
||||
def _get_cell_text(self, cell: _Cell) -> str:
|
||||
# Lookup table for characters stored with the Symbol font (Word uses the
# Symbol font for Greek letters and similar glyphs). Keys are the upper-case
# hex codes found in the w:char attribute of <w:sym>; values are the Unicode
# replacements.
# NOTE(review): F032 and F0B2 are both mapped to a superscript two; confirm
# these two codes against the Symbol font table.
SYMBOL_CHAR_MAP = dict(
    F063="χ",  # chi
    F032="²",  # superscript 2
    F061="α",  # alpha
    F062="β",  # beta
    F067="γ",  # gamma
    F064="δ",  # delta
    F065="ε",  # epsilon
    F06D="μ",  # mu
    F073="σ",  # sigma
    F070="π",  # pi
    F0B2="²",  # another superscript 2 encoding
)
|
||||
|
||||
def _clean_statistical_text(self, text: str) -> str:
|
||||
"""
|
||||
清洗统计学文本中的特殊字符
|
||||
|
||||
关键清洗:
|
||||
1. 负号归一化(最重要!防止 float() 崩溃)
|
||||
2. 比较符归一化
|
||||
3. 零宽字符清理
|
||||
"""
|
||||
if not text:
|
||||
return ""
|
||||
|
||||
# 1. 负号归一化(极高危!)
|
||||
# Word 会自动把连字符转成破折号或数学减号,导致 float() 报错
|
||||
text = text.replace('\u2212', '-') # 数学减号 (Minus Sign)
|
||||
text = text.replace('\u2013', '-') # En Dash
|
||||
text = text.replace('\u2014', '-') # Em Dash
|
||||
text = text.replace('\u2010', '-') # Hyphen
|
||||
text = text.replace('\u2011', '-') # Non-Breaking Hyphen
|
||||
text = text.replace('\u00ad', '-') # Soft Hyphen
|
||||
|
||||
# 2. 比较符归一化
|
||||
text = text.replace('\u2264', '<=') # ≤
|
||||
text = text.replace('\u2265', '>=') # ≥
|
||||
text = text.replace('\u2260', '!=') # ≠
|
||||
text = text.replace('\u2248', '~=') # ≈
|
||||
|
||||
# 3. 加减号归一化
|
||||
# 保留 ± 原样,因为它在统计学中有特定含义(如 mean±SD)
|
||||
# text = text.replace('\u00b1', '+/-') # ±
|
||||
|
||||
# 4. 乘号归一化
|
||||
text = text.replace('\u00d7', 'x') # ×
|
||||
text = text.replace('\u2217', '*') # ∗ (asterisk operator)
|
||||
|
||||
# 5. 零宽字符清理
|
||||
text = text.replace('\u200b', '') # Zero-Width Space
|
||||
text = text.replace('\u200c', '') # Zero-Width Non-Joiner
|
||||
text = text.replace('\u200d', '') # Zero-Width Joiner
|
||||
text = text.replace('\ufeff', '') # BOM / Zero-Width No-Break Space
|
||||
text = text.replace('\u00a0', ' ') # Non-Breaking Space -> 普通空格
|
||||
|
||||
return text
|
||||
|
||||
def _get_cell_text(self, cell: _Cell, use_newline: bool = False) -> str:
    """
    Return the text of a table cell, merging all of its paragraphs.

    Each paragraph is read through _extract_paragraph_text() so that Word
    <w:sym> symbol characters (χ, ² and so on) are included. Paragraphs that
    are empty after stripping are skipped. The merged text is then passed
    through _clean_statistical_text() (minus-sign normalization etc.).

    Args:
        cell: Word table cell object.
        use_newline: Join paragraphs with a newline instead of a space
            (used for the HTML rendering).

    Returns:
        The merged, cleaned cell text.
    """
    texts = []
    for para in cell.paragraphs:
        # Enhanced extraction: also picks up symbol characters.
        para_text = self._extract_paragraph_text(para).strip()
        if para_text:
            texts.append(para_text)

    separator = "\n" if use_newline else " "
    raw_text = separator.join(texts).strip()

    # Clean statistics-specific special characters before returning.
    return self._clean_statistical_text(raw_text)
|
||||
|
||||
def _extract_paragraph_text(self, para: Paragraph) -> str:
    """
    Extract the text of a paragraph, including <w:sym> symbol characters.

    Word writes glyphs such as χ as <w:sym w:font="Symbol" w:char="F063"/>,
    which the plain paragraph text does not contain. This walks every element
    below the paragraph and collects:

    - <w:t> text nodes;
    - <w:sym> elements in the Symbol font, translated via SYMBOL_CHAR_MAP
      (unmapped codes are emitted as "[SYM:<code>]" and logged at debug level);
    - in-run <w:tab> as a tab, and in-run <w:cr> / text-wrapping <w:br> as a
      newline, so a soft line break inside a cell keeps separating its values
      instead of fusing them (e.g. "0.017" + "0.01").

    Args:
        para: python-docx paragraph object.

    Returns:
        The concatenated paragraph text.
    """
    from docx.oxml.ns import qn

    text_tag = qn('w:t')
    sym_tag = qn('w:sym')
    run_tag = qn('w:r')
    tab_tag = qn('w:tab')
    br_tag = qn('w:br')
    cr_tag = qn('w:cr')

    text_parts = []

    # Walk all descendant elements of the paragraph (not only direct runs).
    for element in para._p.iter():
        tag = element.tag

        # Plain text
        if tag == text_tag:
            text_parts.append(element.text or '')

        # Symbol character <w:sym>
        elif tag == sym_tag:
            font = element.get(qn('w:font'))
            char_code = element.get(qn('w:char'))

            if font == 'Symbol' and char_code:
                unicode_char = self.SYMBOL_CHAR_MAP.get(char_code.upper(), '')
                if unicode_char:
                    text_parts.append(unicode_char)
                else:
                    # Unknown symbol: keep a visible placeholder and log it.
                    logger.debug(f"Unknown Symbol char: {char_code}")
                    text_parts.append(f'[SYM:{char_code}]')

        # Whitespace-producing run content
        elif tag in (tab_tag, br_tag, cr_tag):
            parent = element.getparent()
            if parent is None or parent.tag != run_tag:
                # e.g. tab-stop definitions under paragraph properties
                continue
            if tag == tab_tag:
                text_parts.append('\t')
            elif tag == cr_tag:
                text_parts.append('\n')
            elif element.get(qn('w:type')) in (None, 'textWrapping'):
                # Page and column breaks are layout-only; skip them.
                text_parts.append('\n')

    return ''.join(text_parts)
|
||||
|
||||
def _generate_html(
|
||||
self,
|
||||
@@ -296,8 +406,10 @@ class DocxTableExtractor:
|
||||
html_parts.append(" <tr>")
|
||||
for col_idx, cell in enumerate(row, start=1):
|
||||
coord = f"R{row_idx}C{col_idx}"
|
||||
# 为每个子行添加 span 标记,支持细粒度高亮
|
||||
cell_html = self._escape_html_with_subrows(cell, coord)
|
||||
html_parts.append(
|
||||
f' <td data-coord="{coord}">{self._escape_html(cell)}</td>'
|
||||
f' <td data-coord="{coord}">{cell_html}</td>'
|
||||
)
|
||||
html_parts.append(" </tr>")
|
||||
html_parts.append(" </tbody>")
|
||||
@@ -307,7 +419,43 @@ class DocxTableExtractor:
|
||||
return "\n".join(html_parts)
|
||||
|
||||
def _escape_html(self, text: str) -> str:
|
||||
"""转义 HTML 特殊字符"""
|
||||
"""转义 HTML 特殊字符,并将换行符转换为 <br>"""
|
||||
escaped = (
|
||||
text
|
||||
.replace("&", "&")
|
||||
.replace("<", "<")
|
||||
.replace(">", ">")
|
||||
.replace('"', """)
|
||||
.replace("'", "'")
|
||||
)
|
||||
# 将换行符转换为 <br> 标签,保留表格中的多行结构
|
||||
return escaped.replace("\n", "<br>")
|
||||
|
||||
def _escape_html_with_subrows(self, text: str, coord: str) -> str:
    """
    Escape cell text for HTML, wrapping each sub-row in its own <span> so
    that individual lines of a multi-line cell can be highlighted.

    For example, the cell content "0.017\\n0.01\\n<0.001" at R5C5 yields:
        <span data-subcoord="R5C5S1">0.017</span><br>
        <span data-subcoord="R5C5S2">0.01</span><br>
        <span data-subcoord="R5C5S3">&lt;0.001</span>

    Args:
        text: Cell text; sub-rows are separated by "\\n".
        coord: Cell coordinate such as "R5C5".

    Returns:
        The escaped text as is for single-line content (no span); otherwise
        the per-line spans joined by "<br>". Sub-row numbers start at 1.
    """
    # Single-line content: escape it directly, no span wrapper.
    if "\n" not in text:
        return self._escape_single(text)

    # Multi-line content: one span per line.
    spans = []
    for number, segment in enumerate(text.split("\n"), start=1):
        escaped_segment = self._escape_single(segment)
        spans.append(
            f'<span data-subcoord="{coord}S{number}">{escaped_segment}</span>'
        )
    return "<br>".join(spans)
|
||||
|
||||
def _escape_single(self, text: str) -> str:
|
||||
"""转义单行文本的 HTML 特殊字符"""
|
||||
return (
|
||||
text
|
||||
.replace("&", "&")
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
定义所有数据结构,确保类型安全和接口一致性。
|
||||
"""
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, model_serializer
|
||||
from typing import List, Dict, Any, Optional
|
||||
from enum import Enum
|
||||
|
||||
@@ -59,15 +59,30 @@ class ForensicsConfig(BaseModel):
|
||||
|
||||
|
||||
class CellLocation(BaseModel):
    """Cell position (R1C1 coordinates), with optional sub-row addressing inside a cell."""

    # table_id can be populated either by field name or by its alias
    # "tableId" (see populate_by_name below).
    table_id: str = Field(..., alias="tableId", description="表格 ID,如 tbl_0")
    row: int = Field(..., description="行号,从 1 开始")
    col: int = Field(..., description="列号,从 1 开始")
    subrow: Optional[int] = Field(None, description="单元格内子行号,从 1 开始(用于多行单元格)")

    model_config = {"populate_by_name": True}

    @property
    def cell_ref(self) -> str:
        """Return the cell coordinate in R1C1 form (without the sub-row part)."""
        return f"R{self.row}C{self.col}"

    @model_serializer
    def serialize(self) -> Dict[str, Any]:
        """
        Serialize to a camelCase dict and add a computed "cellRef" entry.

        "cellRef" is R{row}C{col}, extended to R{row}C{col}S{subrow} when a
        sub-row is set.
        """
        cell_ref = self.cell_ref
        if self.subrow is not None:
            cell_ref += f"S{self.subrow}"

        return {
            "tableId": self.table_id,
            "row": self.row,
            "col": self.col,
            "subrow": self.subrow,
            "cellRef": cell_ref
        }
|
||||
|
||||
|
||||
class Issue(BaseModel):
|
||||
@@ -84,26 +99,30 @@ class TableData(BaseModel):
|
||||
id: str = Field(..., description="表格 ID,如 tbl_0")
|
||||
caption: Optional[str] = Field(None, description="表格标题")
|
||||
type: Optional[str] = Field(None, description="表格类型:BASELINE/OUTCOME/OTHER")
|
||||
row_count: int = Field(..., description="行数")
|
||||
col_count: int = Field(..., description="列数")
|
||||
row_count: int = Field(..., alias="rowCount", description="行数")
|
||||
col_count: int = Field(..., alias="colCount", description="列数")
|
||||
html: str = Field(..., description="预渲染的 HTML 片段")
|
||||
data: List[List[str]] = Field(..., description="二维数组数据")
|
||||
issues: List[Issue] = Field(default_factory=list, description="该表格的问题列表")
|
||||
skipped: bool = Field(default=False, description="是否被跳过(超限)")
|
||||
skip_reason: Optional[str] = Field(None, description="跳过原因")
|
||||
skip_reason: Optional[str] = Field(None, alias="skipReason", description="跳过原因")
|
||||
|
||||
model_config = {"populate_by_name": True}
|
||||
|
||||
|
||||
class ForensicsResult(BaseModel):
    """Result of a data-forensics analysis run."""

    # Each field is declared exactly once. Multi-word fields carry a camelCase
    # alias, and populate_by_name lets them be set by either name.
    success: bool = Field(..., description="是否成功")
    methods_found: List[str] = Field(default_factory=list, alias="methodsFound", description="检测到的统计方法")
    tables: List[TableData] = Field(default_factory=list, description="表格列表")
    total_issues: int = Field(default=0, alias="totalIssues", description="总问题数")
    error_count: int = Field(default=0, alias="errorCount", description="ERROR 级别问题数")
    warning_count: int = Field(default=0, alias="warningCount", description="WARNING 级别问题数")
    execution_time_ms: int = Field(default=0, alias="executionTimeMs", description="执行时间(毫秒)")
    error: Optional[str] = Field(None, description="错误信息(如果失败)")
    fallback_available: bool = Field(default=True, alias="fallbackAvailable", description="是否可降级执行")

    model_config = {"populate_by_name": True}
|
||||
|
||||
|
||||
class ExtractionError(Exception):
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -52,9 +52,6 @@ app.add_middleware(
|
||||
TEMP_DIR = Path(os.getenv("TEMP_DIR", "/tmp/extraction_service"))
|
||||
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 注册 RVW V2.0 数据侦探路由
|
||||
app.include_router(forensics_router)
|
||||
|
||||
# 导入服务模块
|
||||
from services.pdf_extractor import extract_pdf_pymupdf
|
||||
from services.pdf_processor import extract_pdf, get_pdf_processing_strategy
|
||||
@@ -71,6 +68,7 @@ from services.doc_export_service import check_pandoc_available, convert_markdown
|
||||
|
||||
# 新增:RVW V2.0 数据侦探模块
|
||||
from forensics.api import router as forensics_router
|
||||
app.include_router(forensics_router)
|
||||
|
||||
# Compatibility: nougat-related (deprecated; an empty implementation is kept
# to avoid errors).
def check_nougat_available():
    """Always report nougat as unavailable."""
    return False
|
||||
|
||||
Reference in New Issue
Block a user