feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development

Phase I - Session Blackboard + READ Layer:
- SessionBlackboardService with Postgres-Only cache
- DataProfileService for data overview generation
- PicoInferenceService for LLM-driven PICO extraction
- Frontend DataContextCard and VariableDictionaryPanel
- E2E tests: 31/31 passed

Phase II - Conversation Layer LLM + Intent Router:
- ConversationService with SSE streaming
- IntentRouterService (rule-first + LLM fallback, 6 intents)
- SystemPromptService with 6-segment dynamic assembly
- TokenTruncationService for context management
- ChatHandlerService as unified chat entry
- Frontend SSAChatPane and useSSAChat hook
- E2E tests: 38/38 passed

Phase III - Method Consultation + AskUser Standardization:
- ToolRegistryService with Repository Pattern
- MethodConsultService with DecisionTable + LLM enhancement
- AskUserService with global interrupt handling
- Frontend AskUserCard component
- E2E tests: 13/13 passed

Phase IV - Dialogue-Driven Analysis + QPER Integration:
- ToolOrchestratorService (plan/execute/report)
- analysis_plan SSE event for WorkflowPlan transmission
- Dual-channel confirmation (ask_user card + workspace button)
- PICO as optional hint for LLM parsing
- E2E tests: 25/25 passed

R Statistics Service:
- 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon
- Enhanced guardrails and block helpers
- Comprehensive test suite (run_all_tools_test.js)

Documentation:
- Updated system status document (v5.9)
- Updated SSA module status and development plan (v1.8)

Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-22 18:53:39 +08:00
parent bf10dec4c8
commit 3446909ff7
68 changed files with 11583 additions and 412 deletions

View File

@@ -95,7 +95,7 @@ from operations.metric_time_transform import (
)
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
# ✨ SSA Phase 2A: 数据画像
from operations.data_profile import generate_data_profile, get_quality_score
from operations.data_profile import generate_data_profile, get_quality_score, analyze_variable_detail
# ==================== Pydantic Models ====================
@@ -248,6 +248,14 @@ class DataProfileCSVRequest(BaseModel):
include_quality_score: bool = True
class VariableDetailRequest(BaseModel):
    """Request model for single-variable detail analysis (SSA Phase I).

    Consumed by POST /api/ssa/variable-detail: the CSV payload is parsed
    with pandas and the named column is profiled by analyze_variable_detail().
    """
    csv_content: str  # raw CSV text of the whole dataset
    variable_name: str  # column to analyze; must exist in the CSV
    max_bins: int = 30  # cap on histogram bins (H2 guard, per endpoint docstring)
    max_qq_points: int = 200  # cap on Q-Q plot points returned
class FillnaSimpleRequest(BaseModel):
"""简单填补请求模型"""
data: List[Dict[str, Any]]
@@ -2265,6 +2273,46 @@ async def ssa_data_profile_csv(request: DataProfileCSVRequest):
}, status_code=400)
# ==================== 单变量详情 API (Phase I) ====================
@app.post("/api/ssa/variable-detail")
async def ssa_variable_detail(request: VariableDetailRequest):
    """
    Detailed single-variable analysis (SSA Phase I).

    Parses the uploaded CSV and returns descriptive statistics, histogram
    data, a normality test and Q-Q plot points for the requested variable.
    Histogram bins are capped at request.max_bins (default 30, H2 guard)
    and Q-Q points at request.max_qq_points.

    Responds 200 on success, 400 when the analysis reports failure or when
    parsing/analysis raises.
    """
    try:
        # Imports are kept inside the try so an import failure is reported
        # as a 400 payload instead of crashing the handler.
        import time
        from io import StringIO

        import pandas as pd

        started = time.time()
        frame = pd.read_csv(StringIO(request.csv_content))
        logger.info(f"[SSA] 单变量详情分析: {request.variable_name}")

        payload = analyze_variable_detail(
            frame,
            request.variable_name,
            max_bins=request.max_bins,
            max_qq_points=request.max_qq_points,
        )
        payload['execution_time'] = round(time.time() - started, 3)

        http_status = 200 if payload.get('success') else 400
        return JSONResponse(content=payload, status_code=http_status)
    except Exception as e:
        logger.error(f"[SSA] 单变量详情分析失败: {str(e)}")
        return JSONResponse(
            content={"success": False, "error": str(e)},
            status_code=400,
        )
# ==================== Word 导出 API ====================
@app.get("/api/pandoc/status")

View File

@@ -1,12 +1,18 @@
"""
SSA DataProfile - 数据画像生成模块 (Phase 2A)
SSA DataProfile - 数据画像生成模块 (Phase 2A → Phase I)
提供数据上传时的快速画像生成,用于 LLM 生成 SAP分析计划
高性能实现,利用 pandas 的向量化操作。
Phase I 新增:
- compute_normality_tests(df) — Shapiro-Wilk / K-S 正态性检验
- compute_complete_cases(df) — 完整病例计数
- analyze_variable_detail() — 单变量详细分析(直方图+Q-Q图数据
"""
import pandas as pd
import numpy as np
from scipy import stats as scipy_stats
from typing import List, Dict, Any, Optional
from loguru import logger
@@ -55,11 +61,16 @@ def generate_data_profile(df: pd.DataFrame, max_unique_values: int = 20) -> Dict
'totalMissingCells': int(total_missing)
}
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列")
normality_tests = compute_normality_tests(df, columns)
complete_case_count = compute_complete_cases(df)
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列, 完整病例 {complete_case_count}")
return {
'columns': columns,
'summary': summary
'summary': summary,
'normalityTests': normality_tests,
'completeCaseCount': complete_case_count
}
@@ -317,3 +328,168 @@ def get_quality_score(profile: Dict[str, Any]) -> Dict[str, Any]:
'issues': issues,
'recommendations': recommendations
}
# ────────────────────────────────────────────
# Phase I 新增函数
# ────────────────────────────────────────────
def compute_normality_tests(df: pd.DataFrame, columns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Run a normality test on every numeric column.

    Shapiro-Wilk is used for samples with n <= 5000; larger samples fall
    back to a Kolmogorov-Smirnov test against N(mean, std), since
    Shapiro-Wilk is unreliable for very large n.

    Args:
        df: source data frame.
        columns: column descriptors (as built by the profile generator);
            only entries with type == 'numeric' are tested.

    Returns:
        One dict per testable column: variable, method ('shapiro_wilk' |
        'kolmogorov_smirnov'), statistic, pValue, isNormal (p >= 0.05).
        Columns with fewer than 3 valid values or zero variance are
        skipped — previously constant columns could reach SciPy, which
        warns and may return NaN, and ``bool(nan >= 0.05)`` then reported
        a spurious "normal" verdict.
    """
    results = []
    numeric_cols = [c['name'] for c in columns if c['type'] == 'numeric']
    for col_name in numeric_cols:
        try:
            col_data = pd.to_numeric(df[col_name], errors='coerce').dropna()
            # Shapiro-Wilk needs n >= 3; a constant column has no defined
            # normality test, so skip it instead of reporting NaN results.
            if len(col_data) < 3 or col_data.nunique() < 2:
                continue
            if len(col_data) <= 5000:
                stat, p_value = scipy_stats.shapiro(col_data)
                method = 'shapiro_wilk'
            else:
                stat, p_value = scipy_stats.kstest(
                    col_data, 'norm',
                    args=(col_data.mean(), col_data.std())
                )
                method = 'kolmogorov_smirnov'
            # Degenerate SciPy output — don't emit a bogus verdict.
            if not (np.isfinite(stat) and np.isfinite(p_value)):
                continue
            results.append({
                'variable': col_name,
                'method': method,
                'statistic': round(float(stat), 4),
                'pValue': round(float(p_value), 4),
                'isNormal': bool(p_value >= 0.05)
            })
        except Exception as e:
            logger.warning(f"正态性检验失败 [{col_name}]: {e}")
    return results
def compute_complete_cases(df: pd.DataFrame) -> int:
    """Count rows that have no missing value in any column."""
    complete_mask = df.notna().all(axis=1)
    return int(complete_mask.sum())
def analyze_variable_detail(df: pd.DataFrame, variable_name: str,
                            max_bins: int = 30, max_qq_points: int = 200) -> Dict[str, Any]:
    """
    Detailed single-variable analysis (backend of the Phase I
    get_variable_detail tool).

    For numeric columns: descriptive statistics, IQR outliers, histogram
    data, a normality test and Q-Q plot points. For categorical columns:
    the level distribution and mode. Histogram bins are hard-capped at
    max_bins (H2 guard) and Q-Q points at max_qq_points.

    Args:
        df: source data frame.
        variable_name: column to analyze.
        max_bins: upper bound on histogram bins.
        max_qq_points: upper bound on Q-Q plot sample points.

    Returns:
        Dict with a 'success' flag; on failure (unknown column) an
        'error' message, otherwise the payload described above.
    """
    if variable_name not in df.columns:
        return {'success': False, 'error': f"变量 '{variable_name}' 不存在"}

    col = df[variable_name]
    non_null = col.dropna()
    total = len(col)
    missing = int(col.isna().sum())
    unique_count = int(non_null.nunique())
    col_type = infer_column_type(col, unique_count, total)

    result: Dict[str, Any] = {
        'success': True,
        'variable': variable_name,
        'type': col_type,
        'totalCount': total,
        'missingCount': missing,
        'missingRate': round(missing / total * 100, 2) if total > 0 else 0,
        'uniqueCount': unique_count,
    }
    if col_type == 'numeric':
        _add_numeric_detail(result, non_null, unique_count, max_bins, max_qq_points)
    elif col_type == 'categorical':
        _add_categorical_detail(result, non_null)
    return result


def _safe_round(value: Any, digits: int = 4) -> Optional[float]:
    """Round to *digits*, mapping NaN/inf to None so the payload stays
    JSON-safe (e.g. std/skew of a degenerate sample is NaN)."""
    value = float(value)
    return round(value, digits) if np.isfinite(value) else None


def _add_numeric_detail(result: Dict[str, Any], non_null: pd.Series,
                        unique_count: int, max_bins: int, max_qq_points: int) -> None:
    """Fill *result* (in place) with descriptive stats, outliers, histogram,
    normality test and Q-Q data for a numeric column."""
    col_numeric = pd.to_numeric(non_null, errors='coerce').dropna()
    if len(col_numeric) == 0:
        # All values failed numeric coercion — nothing to describe.
        result['descriptive'] = {}
        return

    q1 = float(col_numeric.quantile(0.25))
    q3 = float(col_numeric.quantile(0.75))
    iqr_val = q3 - q1
    # Tukey fences: 1.5 * IQR beyond the quartiles.
    lower_bound = q1 - 1.5 * iqr_val
    upper_bound = q3 + 1.5 * iqr_val
    outliers = col_numeric[(col_numeric < lower_bound) | (col_numeric > upper_bound)]

    result['descriptive'] = {
        'mean': _safe_round(col_numeric.mean()),
        'std': _safe_round(col_numeric.std()),
        'median': _safe_round(col_numeric.median()),
        'min': _safe_round(col_numeric.min()),
        'max': _safe_round(col_numeric.max()),
        'q1': _safe_round(q1),
        'q3': _safe_round(q3),
        'iqr': _safe_round(iqr_val),
        'skewness': _safe_round(col_numeric.skew()) if len(col_numeric) >= 3 else None,
        'kurtosis': _safe_round(col_numeric.kurtosis()) if len(col_numeric) >= 4 else None,
    }
    result['outliers'] = {
        'count': int(len(outliers)),
        'rate': round(len(outliers) / len(col_numeric) * 100, 2),
        'lowerBound': round(lower_bound, 4),
        'upperBound': round(upper_bound, 4),
    }

    # H2 guard: never emit more bins than requested or than distinct values.
    n_bins = min(max_bins, unique_count)
    hist_counts, hist_edges = np.histogram(col_numeric, bins=max(n_bins, 1))
    result['histogram'] = {
        'counts': [int(c) for c in hist_counts],
        'edges': [round(float(e), 4) for e in hist_edges],
    }

    if len(col_numeric) >= 3:
        try:
            # Same method split as compute_normality_tests: Shapiro-Wilk up
            # to 5000 points, Kolmogorov-Smirnov beyond.
            if len(col_numeric) <= 5000:
                stat, p_val = scipy_stats.shapiro(col_numeric)
                method = 'shapiro_wilk'
            else:
                stat, p_val = scipy_stats.kstest(col_numeric, 'norm',
                                                 args=(col_numeric.mean(), col_numeric.std()))
                method = 'kolmogorov_smirnov'
            result['normalityTest'] = {
                'method': method,
                'statistic': round(float(stat), 4),
                'pValue': round(float(p_val), 4),
                'isNormal': bool(p_val >= 0.05),
            }
        except Exception:
            result['normalityTest'] = None

        # Q-Q plot data: evenly subsample sorted values down to
        # max_qq_points, then compare against standard-normal quantiles.
        sorted_data = np.sort(col_numeric.values)
        n = len(sorted_data)
        if n > max_qq_points:
            indices = np.linspace(0, n - 1, max_qq_points, dtype=int)
            sampled = sorted_data[indices]
        else:
            sampled = sorted_data
        theoretical = scipy_stats.norm.ppf(
            np.linspace(1 / (len(sampled) + 1), len(sampled) / (len(sampled) + 1), len(sampled))
        )
        result['qqPlot'] = {
            'theoretical': [round(float(t), 4) for t in theoretical],
            'observed': [round(float(o), 4) for o in sampled],
        }


def _add_categorical_detail(result: Dict[str, Any], non_null: pd.Series) -> None:
    """Fill *result* (in place) with the level distribution and mode
    statistics for a categorical column."""
    value_counts = non_null.value_counts()
    total_non_null = len(non_null)
    result['distribution'] = [
        {
            'value': str(val),
            'count': int(cnt),
            'percentage': round(cnt / total_non_null * 100, 2)
        }
        for val, cnt in value_counts.items()
    ]
    result['descriptive'] = {
        'totalLevels': int(len(value_counts)),
        'modeValue': str(value_counts.index[0]) if len(value_counts) > 0 else None,
        'modeCount': int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
    }