feat(ssa): Complete QPER architecture - Query, Planner, Execute, Reflection layers

Implement the full QPER intelligent analysis pipeline:

- Phase E+: Block-based standardization for all 7 R tools, DynamicReport renderer, Word export enhancement
- Phase Q: LLM intent parsing with dynamic Zod validation against real column names, ClarificationCard component, DataProfile is_id_like tagging
- Phase P: ConfigLoader with Zod schema validation and hot-reload API, DecisionTableService (4-dimension matching), FlowTemplateService with EPV protection, PlannedTrace audit output
- Phase R: ReflectionService with statistical slot injection, sensitivity analysis conflict rules, ConclusionReport with section reveal animation, conclusion caching API, graceful R error classification

End-to-end test: 40/40 passed across two complete analysis scenarios.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 18:15:53 +08:00
parent 428a22adf2
commit 371e1c069c
73 changed files with 9242 additions and 706 deletions
--- a/extraction_service/operations/data_profile.py
+++ b/extraction_service/operations/data_profile.py
@@ -99,9 +99,35 @@ def analyze_column(col: pd.Series, col_name: str, max_unique_values: int = 20) -
    elif col_type == 'datetime':
        profile.update(analyze_datetime_column(non_null))
    
+    profile['isIdLike'] = _detect_id_like(col_name, col_type, unique_count, total_count)
+    
    return profile


+import re
+
+_ID_PATTERNS = re.compile(  # column-name patterns searched by _detect_id_like
+    r'(_id|_no|_code|编号|序号|流水号|主键|record_date|visit_date|enroll_date)$|^(id|ID|Id)_|^(patient|subject|sample|record)_?id$',
+    re.IGNORECASE  # case-insensitive matching, so the id|ID|Id alternatives are equivalent
+)  # NOTE(review): a column named exactly "id" matches none of the alternatives; confirm intended
+
+
+def _detect_id_like(col_name: str, col_type: str, unique_count: int, total_count: int) -> bool:
+    """
+    判断列是否为非分析变量（ID / 高基数字符串 / 日期）
+    标记为 True 后，Q 层 Context Pruning 会在注入 Prompt 前物理剔除这些列
+    """
+    if col_type == 'datetime':
+        return True
+    if _ID_PATTERNS.search(col_name):
+        return True
+    if col_type == 'text' and total_count > 0 and unique_count / total_count > 0.95:
+        return True
+    if col_type == 'categorical' and total_count > 0 and unique_count / total_count > 0.95:
+        return True
+    return False
+
+
 def infer_column_type(col: pd.Series, unique_count: int, total_count: int) -> str:
    """
    推断列的数据类型