feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development
Phase I - Session Blackboard + READ Layer: - SessionBlackboardService with Postgres-Only cache - DataProfileService for data overview generation - PicoInferenceService for LLM-driven PICO extraction - Frontend DataContextCard and VariableDictionaryPanel - E2E tests: 31/31 passed Phase II - Conversation Layer LLM + Intent Router: - ConversationService with SSE streaming - IntentRouterService (rule-first + LLM fallback, 6 intents) - SystemPromptService with 6-segment dynamic assembly - TokenTruncationService for context management - ChatHandlerService as unified chat entry - Frontend SSAChatPane and useSSAChat hook - E2E tests: 38/38 passed Phase III - Method Consultation + AskUser Standardization: - ToolRegistryService with Repository Pattern - MethodConsultService with DecisionTable + LLM enhancement - AskUserService with global interrupt handling - Frontend AskUserCard component - E2E tests: 13/13 passed Phase IV - Dialogue-Driven Analysis + QPER Integration: - ToolOrchestratorService (plan/execute/report) - analysis_plan SSE event for WorkflowPlan transmission - Dual-channel confirmation (ask_user card + workspace button) - PICO as optional hint for LLM parsing - E2E tests: 25/25 passed R Statistics Service: - 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon - Enhanced guardrails and block helpers - Comprehensive test suite (run_all_tools_test.js) Documentation: - Updated system status document (v5.9) - Updated SSA module status and development plan (v1.8) Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25) Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -95,7 +95,7 @@ from operations.metric_time_transform import (
|
||||
)
|
||||
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
|
||||
# ✨ SSA Phase 2A: 数据画像
|
||||
from operations.data_profile import generate_data_profile, get_quality_score
|
||||
from operations.data_profile import generate_data_profile, get_quality_score, analyze_variable_detail
|
||||
|
||||
|
||||
# ==================== Pydantic Models ====================
|
||||
@@ -248,6 +248,14 @@ class DataProfileCSVRequest(BaseModel):
|
||||
include_quality_score: bool = True
|
||||
|
||||
|
||||
class VariableDetailRequest(BaseModel):
|
||||
"""单变量详情请求模型 (SSA Phase I)"""
|
||||
csv_content: str
|
||||
variable_name: str
|
||||
max_bins: int = 30
|
||||
max_qq_points: int = 200
|
||||
|
||||
|
||||
class FillnaSimpleRequest(BaseModel):
|
||||
"""简单填补请求模型"""
|
||||
data: List[Dict[str, Any]]
|
||||
@@ -2265,6 +2273,46 @@ async def ssa_data_profile_csv(request: DataProfileCSVRequest):
|
||||
}, status_code=400)
|
||||
|
||||
|
||||
# ==================== Single-Variable Detail API (Phase I) ====================

@app.post("/api/ssa/variable-detail")
async def ssa_variable_detail(request: VariableDetailRequest):
    """Single-variable detailed analysis endpoint (SSA Phase I).

    Parses the uploaded CSV and delegates to ``analyze_variable_detail`` for
    the requested column, returning descriptive statistics, histogram data,
    a normality test and Q-Q plot points. Histogram bins are capped at
    ``request.max_bins`` (default 30, H2 guardrail) and Q-Q points at
    ``request.max_qq_points``. Any failure is reported as a 400 JSON body
    with ``success: False``.
    """
    try:
        import time
        from io import StringIO

        import pandas as pd

        started = time.time()
        frame = pd.read_csv(StringIO(request.csv_content))

        logger.info(f"[SSA] 单变量详情分析: {request.variable_name}")

        detail = analyze_variable_detail(
            frame,
            request.variable_name,
            max_bins=request.max_bins,
            max_qq_points=request.max_qq_points,
        )
        # Wall-clock duration in seconds, rounded for the payload.
        detail['execution_time'] = round(time.time() - started, 3)

        return JSONResponse(
            content=detail,
            status_code=200 if detail.get('success') else 400,
        )

    except Exception as e:
        logger.error(f"[SSA] 单变量详情分析失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e)
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== Word 导出 API ====================
|
||||
|
||||
@app.get("/api/pandoc/status")
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
"""
|
||||
SSA DataProfile - 数据画像生成模块 (Phase 2A)
|
||||
SSA DataProfile - 数据画像生成模块 (Phase 2A → Phase I)
|
||||
|
||||
提供数据上传时的快速画像生成,用于 LLM 生成 SAP(分析计划)。
|
||||
高性能实现,利用 pandas 的向量化操作。
|
||||
|
||||
Phase I 新增:
|
||||
- compute_normality_tests(df) — Shapiro-Wilk / K-S 正态性检验
|
||||
- compute_complete_cases(df) — 完整病例计数
|
||||
- analyze_variable_detail() — 单变量详细分析(直方图+Q-Q图数据)
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats as scipy_stats
|
||||
from typing import List, Dict, Any, Optional
|
||||
from loguru import logger
|
||||
|
||||
@@ -55,11 +61,16 @@ def generate_data_profile(df: pd.DataFrame, max_unique_values: int = 20) -> Dict
|
||||
'totalMissingCells': int(total_missing)
|
||||
}
|
||||
|
||||
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列")
|
||||
normality_tests = compute_normality_tests(df, columns)
|
||||
complete_case_count = compute_complete_cases(df)
|
||||
|
||||
logger.info(f"数据画像生成完成: {numeric_count} 数值列, {categorical_count} 分类列, 完整病例 {complete_case_count}")
|
||||
|
||||
return {
|
||||
'columns': columns,
|
||||
'summary': summary
|
||||
'summary': summary,
|
||||
'normalityTests': normality_tests,
|
||||
'completeCaseCount': complete_case_count
|
||||
}
|
||||
|
||||
|
||||
@@ -317,3 +328,168 @@ def get_quality_score(profile: Dict[str, Any]) -> Dict[str, Any]:
|
||||
'issues': issues,
|
||||
'recommendations': recommendations
|
||||
}
|
||||
|
||||
|
||||
# ────────────────────────────────────────────
|
||||
# Phase I 新增函数
|
||||
# ────────────────────────────────────────────
|
||||
|
||||
def compute_normality_tests(df: pd.DataFrame, columns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Run a normality test on every numeric column of *df*.

    Columns with fewer than 3 non-missing values are skipped. Samples of
    size <= 5000 use Shapiro-Wilk; larger samples fall back to a
    Kolmogorov-Smirnov test against a normal fitted by sample mean/std.

    Args:
        df: Source data frame.
        columns: Column descriptors; only entries whose ``type`` is
            ``'numeric'`` are tested.

    Returns:
        One dict per tested column with keys ``variable``, ``method``,
        ``statistic``, ``pValue`` and ``isNormal`` (alpha = 0.05).
    """
    tested: List[Dict[str, Any]] = []
    for descriptor in columns:
        if descriptor['type'] != 'numeric':
            continue
        col_name = descriptor['name']
        try:
            values = pd.to_numeric(df[col_name], errors='coerce').dropna()
            if len(values) < 3:
                continue  # too few observations for any normality test

            if len(values) <= 5000:
                method = 'shapiro_wilk'
                stat, p_value = scipy_stats.shapiro(values)
            else:
                # Shapiro-Wilk is unreliable (and slow) for very large n;
                # degrade to K-S against a fitted normal.
                method = 'kolmogorov_smirnov'
                stat, p_value = scipy_stats.kstest(
                    values, 'norm', args=(values.mean(), values.std())
                )

            tested.append({
                'variable': col_name,
                'method': method,
                'statistic': round(float(stat), 4),
                'pValue': round(float(p_value), 4),
                'isNormal': bool(p_value >= 0.05)
            })
        except Exception as e:
            logger.warning(f"正态性检验失败 [{col_name}]: {e}")

    return tested
|
||||
|
||||
|
||||
def compute_complete_cases(df: pd.DataFrame) -> int:
    """Return the number of rows with no missing value in any column."""
    complete_rows = df.dropna()
    return int(len(complete_rows))
|
||||
|
||||
|
||||
def analyze_variable_detail(df: pd.DataFrame, variable_name: str,
                            max_bins: int = 30, max_qq_points: int = 200) -> Dict[str, Any]:
    """
    Detailed single-variable analysis (Phase I: backend of the
    ``get_variable_detail`` tool).

    For numeric columns: descriptive statistics, Tukey-fence outlier
    summary, histogram data, a normality test, and Q-Q plot points.
    For categorical columns: the full level distribution plus mode info.
    Histogram bins are hard-capped at ``max_bins`` (H2 guardrail) and
    Q-Q points at ``max_qq_points``.

    Args:
        df: Source data frame.
        variable_name: Column to analyze; an unknown name returns
            ``{'success': False, 'error': ...}`` instead of raising.
        max_bins: Upper bound on histogram bin count.
        max_qq_points: Upper bound on the Q-Q plot sample size.

    Returns:
        Dict with ``success``, identification/count fields, and the
        type-specific sections described above.
    """
    if variable_name not in df.columns:
        return {'success': False, 'error': f"变量 '{variable_name}' 不存在"}

    col = df[variable_name]
    non_null = col.dropna()
    total = len(col)
    missing = int(col.isna().sum())
    unique_count = int(non_null.nunique())
    # Shared profile-level type inference helper (defined elsewhere in
    # this module) — decides numeric vs categorical handling below.
    col_type = infer_column_type(col, unique_count, total)

    result: Dict[str, Any] = {
        'success': True,
        'variable': variable_name,
        'type': col_type,
        'totalCount': total,
        'missingCount': missing,
        'missingRate': round(missing / total * 100, 2) if total > 0 else 0,
        'uniqueCount': unique_count,
    }

    if col_type == 'numeric':
        col_numeric = pd.to_numeric(non_null, errors='coerce').dropna()
        if len(col_numeric) == 0:
            # Typed as numeric but nothing coercible: return counts only.
            result['descriptive'] = {}
            return result

        # Tukey fences (Q1/Q3 ± 1.5 * IQR) for outlier flagging.
        q1 = float(col_numeric.quantile(0.25))
        q3 = float(col_numeric.quantile(0.75))
        iqr_val = q3 - q1
        lower_bound = q1 - 1.5 * iqr_val
        upper_bound = q3 + 1.5 * iqr_val
        outliers = col_numeric[(col_numeric < lower_bound) | (col_numeric > upper_bound)]

        result['descriptive'] = {
            'mean': round(float(col_numeric.mean()), 4),
            'std': round(float(col_numeric.std()), 4),
            'median': round(float(col_numeric.median()), 4),
            'min': round(float(col_numeric.min()), 4),
            'max': round(float(col_numeric.max()), 4),
            'q1': round(q1, 4),
            'q3': round(q3, 4),
            'iqr': round(iqr_val, 4),
            # Skewness/kurtosis need minimum sample sizes to be defined.
            'skewness': round(float(col_numeric.skew()), 4) if len(col_numeric) >= 3 else None,
            'kurtosis': round(float(col_numeric.kurtosis()), 4) if len(col_numeric) >= 4 else None,
        }

        result['outliers'] = {
            'count': int(len(outliers)),
            'rate': round(len(outliers) / len(col_numeric) * 100, 2),
            'lowerBound': round(lower_bound, 4),
            'upperBound': round(upper_bound, 4),
        }

        # Histogram bin count: capped by max_bins (H2 guardrail) and by the
        # number of distinct values; max(..., 1) guards the degenerate case.
        n_bins = min(max_bins, unique_count)
        hist_counts, hist_edges = np.histogram(col_numeric, bins=max(n_bins, 1))
        result['histogram'] = {
            'counts': [int(c) for c in hist_counts],
            'edges': [round(float(e), 4) for e in hist_edges],
        }

        # Normality test only when it is statistically meaningful (n >= 3).
        if len(col_numeric) >= 3:
            try:
                # Same policy as the profile-level tests: Shapiro-Wilk up to
                # n = 5000, otherwise K-S against a fitted normal.
                if len(col_numeric) <= 5000:
                    stat, p_val = scipy_stats.shapiro(col_numeric)
                    method = 'shapiro_wilk'
                else:
                    stat, p_val = scipy_stats.kstest(col_numeric, 'norm',
                                                     args=(col_numeric.mean(), col_numeric.std()))
                    method = 'kolmogorov_smirnov'
                result['normalityTest'] = {
                    'method': method,
                    'statistic': round(float(stat), 4),
                    'pValue': round(float(p_val), 4),
                    'isNormal': bool(p_val >= 0.05),
                }
            except Exception:
                # Best-effort: a failed test is reported as null, not an error.
                result['normalityTest'] = None

        # Q-Q plot data: evenly subsample sorted values down to
        # max_qq_points to bound the payload size.
        sorted_data = np.sort(col_numeric.values)
        n = len(sorted_data)
        if n > max_qq_points:
            indices = np.linspace(0, n - 1, max_qq_points, dtype=int)
            sampled = sorted_data[indices]
        else:
            sampled = sorted_data
        # Theoretical standard-normal quantiles at plotting positions
        # i / (m + 1), i = 1..m.
        theoretical = scipy_stats.norm.ppf(
            np.linspace(1 / (len(sampled) + 1), len(sampled) / (len(sampled) + 1), len(sampled))
        )
        result['qqPlot'] = {
            'theoretical': [round(float(t), 4) for t in theoretical],
            'observed': [round(float(o), 4) for o in sampled],
        }

    elif col_type == 'categorical':
        value_counts = non_null.value_counts()
        total_non_null = len(non_null)
        # NOTE(review): all levels are returned uncapped — for a
        # high-cardinality column this payload can be large; consider a
        # top-N cap in line with the H2 guardrails.
        result['distribution'] = [
            {
                'value': str(val),
                'count': int(cnt),
                'percentage': round(cnt / total_non_null * 100, 2)
            }
            for val, cnt in value_counts.items()
        ]
        result['descriptive'] = {
            'totalLevels': int(len(value_counts)),
            'modeValue': str(value_counts.index[0]) if len(value_counts) > 0 else None,
            'modeCount': int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
        }

    return result
|
||||
|
||||
Reference in New Issue
Block a user