feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development

Phase I - Session Blackboard + READ Layer:
- SessionBlackboardService with Postgres-Only cache
- DataProfileService for data overview generation
- PicoInferenceService for LLM-driven PICO extraction
- Frontend DataContextCard and VariableDictionaryPanel
- E2E tests: 31/31 passed

Phase II - Conversation Layer LLM + Intent Router:
- ConversationService with SSE streaming
- IntentRouterService (rule-first + LLM fallback, 6 intents)
- SystemPromptService with 6-segment dynamic assembly
- TokenTruncationService for context management
- ChatHandlerService as unified chat entry
- Frontend SSAChatPane and useSSAChat hook
- E2E tests: 38/38 passed

Phase III - Method Consultation + AskUser Standardization:
- ToolRegistryService with Repository Pattern
- MethodConsultService with DecisionTable + LLM enhancement
- AskUserService with global interrupt handling
- Frontend AskUserCard component
- E2E tests: 13/13 passed

Phase IV - Dialogue-Driven Analysis + QPER Integration:
- ToolOrchestratorService (plan/execute/report)
- analysis_plan SSE event for WorkflowPlan transmission
- Dual-channel confirmation (ask_user card + workspace button)
- PICO as optional hint for LLM parsing
- E2E tests: 25/25 passed

R Statistics Service:
- 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon
- Enhanced guardrails and block helpers
- Comprehensive test suite (run_all_tools_test.js)

Documentation:
- Updated system status document (v5.9)
- Updated SSA module status and development plan (v1.8)

Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-22 18:53:39 +08:00
parent bf10dec4c8
commit 3446909ff7
68 changed files with 11583 additions and 412 deletions

View File

@@ -95,7 +95,7 @@ from operations.metric_time_transform import (
)
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
# ✨ SSA Phase 2A: 数据画像
from operations.data_profile import generate_data_profile, get_quality_score
from operations.data_profile import generate_data_profile, get_quality_score, analyze_variable_detail
# ==================== Pydantic Models ====================
@@ -248,6 +248,14 @@ class DataProfileCSVRequest(BaseModel):
include_quality_score: bool = True
class VariableDetailRequest(BaseModel):
    """Request model for single-variable detail analysis (SSA Phase I).

    Consumed by POST /api/ssa/variable-detail: the CSV payload is parsed
    with pandas and the named column is profiled by analyze_variable_detail().
    """
    csv_content: str  # raw CSV text of the whole dataset
    variable_name: str  # column to analyze; must exist in the CSV
    max_bins: int = 30  # cap on histogram bins (H2 guard, per endpoint docstring)
    max_qq_points: int = 200  # cap on Q-Q plot points returned
class FillnaSimpleRequest(BaseModel):
"""简单填补请求模型"""
data: List[Dict[str, Any]]
@@ -2265,6 +2273,46 @@ async def ssa_data_profile_csv(request: DataProfileCSVRequest):
}, status_code=400)
# ==================== 单变量详情 API (Phase I) ====================
@app.post("/api/ssa/variable-detail")
async def ssa_variable_detail(request: VariableDetailRequest):
    """
    Detailed single-variable analysis (SSA Phase I).

    Parses the uploaded CSV and returns descriptive statistics, histogram
    data, a normality test and Q-Q plot points for the requested variable.
    Histogram bins are capped at request.max_bins (default 30, H2 guard)
    and Q-Q points at request.max_qq_points.

    Responds 200 on success, 400 when the analysis reports failure or when
    parsing/analysis raises.
    """
    try:
        # Imports are kept inside the try so an import failure is reported
        # as a 400 payload instead of crashing the handler.
        import time
        from io import StringIO

        import pandas as pd

        started = time.time()
        frame = pd.read_csv(StringIO(request.csv_content))
        logger.info(f"[SSA] 单变量详情分析: {request.variable_name}")

        payload = analyze_variable_detail(
            frame,
            request.variable_name,
            max_bins=request.max_bins,
            max_qq_points=request.max_qq_points,
        )
        payload['execution_time'] = round(time.time() - started, 3)

        http_status = 200 if payload.get('success') else 400
        return JSONResponse(content=payload, status_code=http_status)
    except Exception as e:
        logger.error(f"[SSA] 单变量详情分析失败: {str(e)}")
        return JSONResponse(
            content={"success": False, "error": str(e)},
            status_code=400,
        )
# ==================== Word 导出 API ====================
@app.get("/api/pandoc/status")

View File

@@ -1,12 +1,18 @@
"""
SSA DataProfile - 数据画像生成模块 (Phase 2A)
SSA DataProfile - 数据画像生成模块 (Phase 2A → Phase I)
提供数据上传时的快速画像生成,用于 LLM 生成 SAP分析计划
高性能实现,利用 pandas 的向量化操作。
Phase I 新增:
- compute_normality_tests(df) — Shapiro-Wilk / K-S 正态性检验
- compute_complete_cases(df) — 完整病例计数
- analyze_variable_detail() — 单变量详细分析(直方图+Q-Q图数据
"""
import pandas as pd
import numpy as np
from scipy import stats as scipy_stats
from typing import List, Dict, Any, Optional
from loguru import logger
@@ -55,11 +61,16 @@ def generate_data_profile(df: pd.DataFrame, max_unique_values: int = 20) -> Dict
'totalMissingCells': int(total_missing)
}
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列")
normality_tests = compute_normality_tests(df, columns)
complete_case_count = compute_complete_cases(df)
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列, 完整病例 {complete_case_count}")
return {
'columns': columns,
'summary': summary
'summary': summary,
'normalityTests': normality_tests,
'completeCaseCount': complete_case_count
}
@@ -317,3 +328,168 @@ def get_quality_score(profile: Dict[str, Any]) -> Dict[str, Any]:
'issues': issues,
'recommendations': recommendations
}
# ────────────────────────────────────────────
# Phase I 新增函数
# ────────────────────────────────────────────
def compute_normality_tests(df: pd.DataFrame, columns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run a normality test on every numeric column.

    Shapiro-Wilk is used for samples with n <= 5000; larger samples fall
    back to a Kolmogorov-Smirnov test against N(mean, std), since
    Shapiro-Wilk is unreliable for very large n.

    Args:
        df: source data frame.
        columns: column descriptors (as built by the profile generator);
            only entries with type == 'numeric' are tested.

    Returns:
        One dict per testable column: variable, method ('shapiro_wilk' |
        'kolmogorov_smirnov'), statistic, pValue, isNormal (p >= 0.05).
        Columns with fewer than 3 valid values or zero variance are
        skipped — previously constant columns could reach SciPy, which
        warns and may return NaN, and ``bool(nan >= 0.05)`` then reported
        a spurious "normal" verdict.
    """
    results = []
    numeric_cols = [c['name'] for c in columns if c['type'] == 'numeric']
    for col_name in numeric_cols:
        try:
            col_data = pd.to_numeric(df[col_name], errors='coerce').dropna()
            # Shapiro-Wilk needs n >= 3; a constant column has no defined
            # normality test, so skip it instead of reporting NaN results.
            if len(col_data) < 3 or col_data.nunique() < 2:
                continue
            if len(col_data) <= 5000:
                stat, p_value = scipy_stats.shapiro(col_data)
                method = 'shapiro_wilk'
            else:
                stat, p_value = scipy_stats.kstest(
                    col_data, 'norm',
                    args=(col_data.mean(), col_data.std())
                )
                method = 'kolmogorov_smirnov'
            # Degenerate SciPy output — don't emit a bogus verdict.
            if not (np.isfinite(stat) and np.isfinite(p_value)):
                continue
            results.append({
                'variable': col_name,
                'method': method,
                'statistic': round(float(stat), 4),
                'pValue': round(float(p_value), 4),
                'isNormal': bool(p_value >= 0.05)
            })
        except Exception as e:
            logger.warning(f"正态性检验失败 [{col_name}]: {e}")
    return results
def compute_complete_cases(df: pd.DataFrame) -> int:
    """Count rows that have no missing value in any column."""
    complete_mask = df.notna().all(axis=1)
    return int(complete_mask.sum())
def analyze_variable_detail(df: pd.DataFrame, variable_name: str,
                            max_bins: int = 30, max_qq_points: int = 200) -> Dict[str, Any]:
    """
    Detailed single-variable analysis (backend of the Phase I
    get_variable_detail tool).

    For numeric columns: descriptive statistics, IQR outliers, histogram
    data, a normality test and Q-Q plot points. For categorical columns:
    the level distribution and mode. Histogram bins are hard-capped at
    max_bins (H2 guard) and Q-Q points at max_qq_points.

    Args:
        df: source data frame.
        variable_name: column to analyze.
        max_bins: upper bound on histogram bins.
        max_qq_points: upper bound on Q-Q plot sample points.

    Returns:
        Dict with a 'success' flag; on failure (unknown column) an
        'error' message, otherwise the payload described above.
    """
    if variable_name not in df.columns:
        return {'success': False, 'error': f"变量 '{variable_name}' 不存在"}

    col = df[variable_name]
    non_null = col.dropna()
    total = len(col)
    missing = int(col.isna().sum())
    unique_count = int(non_null.nunique())
    col_type = infer_column_type(col, unique_count, total)

    result: Dict[str, Any] = {
        'success': True,
        'variable': variable_name,
        'type': col_type,
        'totalCount': total,
        'missingCount': missing,
        'missingRate': round(missing / total * 100, 2) if total > 0 else 0,
        'uniqueCount': unique_count,
    }
    if col_type == 'numeric':
        _add_numeric_detail(result, non_null, unique_count, max_bins, max_qq_points)
    elif col_type == 'categorical':
        _add_categorical_detail(result, non_null)
    return result


def _safe_round(value: Any, digits: int = 4) -> Optional[float]:
    """Round to *digits*, mapping NaN/inf to None so the payload stays
    JSON-safe (e.g. std/skew of a degenerate sample is NaN)."""
    value = float(value)
    return round(value, digits) if np.isfinite(value) else None


def _add_numeric_detail(result: Dict[str, Any], non_null: pd.Series,
                        unique_count: int, max_bins: int, max_qq_points: int) -> None:
    """Fill *result* (in place) with descriptive stats, outliers, histogram,
    normality test and Q-Q data for a numeric column."""
    col_numeric = pd.to_numeric(non_null, errors='coerce').dropna()
    if len(col_numeric) == 0:
        # All values failed numeric coercion — nothing to describe.
        result['descriptive'] = {}
        return

    q1 = float(col_numeric.quantile(0.25))
    q3 = float(col_numeric.quantile(0.75))
    iqr_val = q3 - q1
    # Tukey fences: 1.5 * IQR beyond the quartiles.
    lower_bound = q1 - 1.5 * iqr_val
    upper_bound = q3 + 1.5 * iqr_val
    outliers = col_numeric[(col_numeric < lower_bound) | (col_numeric > upper_bound)]

    result['descriptive'] = {
        'mean': _safe_round(col_numeric.mean()),
        'std': _safe_round(col_numeric.std()),
        'median': _safe_round(col_numeric.median()),
        'min': _safe_round(col_numeric.min()),
        'max': _safe_round(col_numeric.max()),
        'q1': _safe_round(q1),
        'q3': _safe_round(q3),
        'iqr': _safe_round(iqr_val),
        'skewness': _safe_round(col_numeric.skew()) if len(col_numeric) >= 3 else None,
        'kurtosis': _safe_round(col_numeric.kurtosis()) if len(col_numeric) >= 4 else None,
    }
    result['outliers'] = {
        'count': int(len(outliers)),
        'rate': round(len(outliers) / len(col_numeric) * 100, 2),
        'lowerBound': round(lower_bound, 4),
        'upperBound': round(upper_bound, 4),
    }

    # H2 guard: never emit more bins than requested or than distinct values.
    n_bins = min(max_bins, unique_count)
    hist_counts, hist_edges = np.histogram(col_numeric, bins=max(n_bins, 1))
    result['histogram'] = {
        'counts': [int(c) for c in hist_counts],
        'edges': [round(float(e), 4) for e in hist_edges],
    }

    if len(col_numeric) >= 3:
        try:
            # Same method split as compute_normality_tests: Shapiro-Wilk up
            # to 5000 points, Kolmogorov-Smirnov beyond.
            if len(col_numeric) <= 5000:
                stat, p_val = scipy_stats.shapiro(col_numeric)
                method = 'shapiro_wilk'
            else:
                stat, p_val = scipy_stats.kstest(col_numeric, 'norm',
                                                 args=(col_numeric.mean(), col_numeric.std()))
                method = 'kolmogorov_smirnov'
            result['normalityTest'] = {
                'method': method,
                'statistic': round(float(stat), 4),
                'pValue': round(float(p_val), 4),
                'isNormal': bool(p_val >= 0.05),
            }
        except Exception:
            result['normalityTest'] = None

        # Q-Q plot data: evenly subsample sorted values down to
        # max_qq_points, then compare against standard-normal quantiles.
        sorted_data = np.sort(col_numeric.values)
        n = len(sorted_data)
        if n > max_qq_points:
            indices = np.linspace(0, n - 1, max_qq_points, dtype=int)
            sampled = sorted_data[indices]
        else:
            sampled = sorted_data
        theoretical = scipy_stats.norm.ppf(
            np.linspace(1 / (len(sampled) + 1), len(sampled) / (len(sampled) + 1), len(sampled))
        )
        result['qqPlot'] = {
            'theoretical': [round(float(t), 4) for t in theoretical],
            'observed': [round(float(o), 4) for o in sampled],
        }


def _add_categorical_detail(result: Dict[str, Any], non_null: pd.Series) -> None:
    """Fill *result* (in place) with the level distribution and mode
    statistics for a categorical column."""
    value_counts = non_null.value_counts()
    total_non_null = len(non_null)
    result['distribution'] = [
        {
            'value': str(val),
            'count': int(cnt),
            'percentage': round(cnt / total_non_null * 100, 2)
        }
        for val, cnt in value_counts.items()
    ]
    result['descriptive'] = {
        'totalLevels': int(len(value_counts)),
        'modeValue': str(value_counts.index[0]) if len(value_counts) > 0 else None,
        'modeCount': int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
    }