feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end
Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -94,6 +94,8 @@ from operations.metric_time_transform import (
|
||||
preview_multi_metric_to_matrix # ✨ 多指标转换预览(方向2)
|
||||
)
|
||||
from operations.fillna import fillna_simple, fillna_mice, get_column_missing_stats
|
||||
# ✨ SSA Phase 2A: 数据画像
|
||||
from operations.data_profile import generate_data_profile, get_quality_score
|
||||
|
||||
|
||||
# ==================== Pydantic Models ====================
|
||||
@@ -231,6 +233,21 @@ class FillnaStatsRequest(BaseModel):
|
||||
column: str
|
||||
|
||||
|
||||
# ✨ SSA Phase 2A: DataProfile 请求模型
|
||||
class DataProfileRequest(BaseModel):
    """Request model for data profiling (SSA Phase 2A).

    The rows are profiled and the result is fed to an LLM to draft the
    statistical analysis plan (SAP).
    """
    data: List[Dict[str, Any]]  # rows as JSON records (one dict per row)
    max_unique_values: int = 20  # max number of unique values listed per categorical column
    include_quality_score: bool = True  # whether to attach a quality score to the response
|
||||
|
||||
|
||||
class DataProfileCSVRequest(BaseModel):
    """Request model for data profiling — raw-CSV variant (SSA Phase 2A)."""
    csv_content: str  # raw CSV file content as a single string
    max_unique_values: int = 20  # max number of unique values listed per categorical column
    include_quality_score: bool = True  # whether to attach a quality score to the response
|
||||
|
||||
|
||||
class FillnaSimpleRequest(BaseModel):
|
||||
"""简单填补请求模型"""
|
||||
data: List[Dict[str, Any]]
|
||||
@@ -2125,6 +2142,129 @@ async def operation_fillna_mice(request: FillnaMiceRequest):
|
||||
}, status_code=400)
|
||||
|
||||
|
||||
# ==================== SSA Phase 2A: DataProfile API ====================
|
||||
|
||||
@app.post("/api/ssa/data-profile")
async def ssa_data_profile(request: DataProfileRequest):
    """
    Generate a data profile (SSA Phase 2A).

    Called by the SSA module when the user uploads data; the profile is
    fed to an LLM to generate the analysis plan (SAP).

    Args:
        request: DataProfileRequest
            - data: rows (JSON records)
            - max_unique_values: max unique values listed per categorical column
            - include_quality_score: whether to include the quality score

    Returns:
        JSONResponse with:
            {
                "success": bool,
                "profile": {"columns": [...], "summary": {...}},
                "quality": {...} (optional),
                "execution_time": float  # seconds
            }
        On failure: HTTP 400 with {"success": False, "error": str, "execution_time": float}.
    """
    import time

    # Hoisted out of the try block so the error path can always compute an
    # elapsed time without the fragile "'start_time' in locals()" probe the
    # previous version relied on.
    start_time = time.time()

    try:
        import pandas as pd

        df = pd.DataFrame(request.data)

        logger.info(f"[SSA] 开始生成数据画像: {df.shape}")

        profile = generate_data_profile(df, request.max_unique_values)

        result = {
            "success": True,
            "profile": profile
        }

        if request.include_quality_score:
            result["quality"] = get_quality_score(profile)

        execution_time = time.time() - start_time
        result["execution_time"] = round(execution_time, 3)

        logger.info(f"[SSA] 数据画像生成完成: {execution_time:.3f}s")

        return JSONResponse(content=result)

    except Exception as e:
        # Service boundary: convert any failure into a structured 400 response.
        logger.error(f"[SSA] 数据画像生成失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
@app.post("/api/ssa/data-profile-csv")
async def ssa_data_profile_csv(request: DataProfileCSVRequest):
    """
    Generate a data profile from raw CSV (SSA Phase 2A).

    Receives the CSV content as a string and lets pandas parse it directly,
    which is faster and more robust than parsing in Node.js and re-sending
    JSON rows.

    Args:
        request: DataProfileCSVRequest
            - csv_content: raw CSV file content (string)
            - max_unique_values: max unique values listed per categorical column
            - include_quality_score: whether to include the quality score

    Returns:
        JSONResponse with:
            {
                "success": bool,
                "profile": {...},
                "quality": {...} (optional),
                "execution_time": float  # seconds
            }
        On failure: HTTP 400 with {"success": False, "error": str, "execution_time": float}.
    """
    import time

    # Hoisted out of the try block so the error path can always compute an
    # elapsed time without probing locals().
    start_time = time.time()

    try:
        import pandas as pd
        from io import StringIO

        # pandas parses the CSV string directly and infers column dtypes.
        df = pd.read_csv(StringIO(request.csv_content))

        logger.info(f"[SSA] CSV 解析完成,开始生成数据画像: {df.shape}")

        profile = generate_data_profile(df, request.max_unique_values)

        result = {
            "success": True,
            "profile": profile
        }

        if request.include_quality_score:
            result["quality"] = get_quality_score(profile)

        execution_time = time.time() - start_time
        result["execution_time"] = round(execution_time, 3)

        logger.info(f"[SSA] 数据画像生成完成 (CSV): {execution_time:.3f}s")

        return JSONResponse(content=result)

    except Exception as e:
        # Service boundary: convert any failure into a structured 400 response.
        logger.error(f"[SSA] CSV 数据画像生成失败: {str(e)}")
        return JSONResponse(content={
            "success": False,
            "error": str(e),
            "execution_time": time.time() - start_time
        }, status_code=400)
|
||||
|
||||
|
||||
# ==================== Word 导出 API ====================
|
||||
|
||||
@app.get("/api/pandoc/status")
|
||||
|
||||
293
extraction_service/operations/data_profile.py
Normal file
293
extraction_service/operations/data_profile.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""
|
||||
SSA DataProfile - 数据画像生成模块 (Phase 2A)
|
||||
|
||||
提供数据上传时的快速画像生成,用于 LLM 生成 SAP(分析计划)。
|
||||
高性能实现,利用 pandas 的向量化操作。
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import List, Dict, Any, Optional
|
||||
from loguru import logger
|
||||
|
||||
|
||||
def generate_data_profile(df: pd.DataFrame, max_unique_values: int = 20) -> Dict[str, Any]:
    """
    Build the DataProfile for a data frame.

    Args:
        df: input data frame
        max_unique_values: cap on the number of unique values listed for
            categorical columns

    Returns:
        DataProfile JSON structure: {'columns': [...], 'summary': {...}}
    """
    n_rows, n_cols = df.shape
    logger.info(f"开始生成数据画像: {n_rows} 行, {n_cols} 列")

    # Profile every column, then tally the inferred types.
    columns = [analyze_column(df[name], name, max_unique_values) for name in df.columns]

    type_counts = {'numeric': 0, 'categorical': 0, 'datetime': 0}
    for col_profile in columns:
        kind = col_profile['type']
        if kind in type_counts:
            type_counts[kind] += 1

    cell_total = n_rows * n_cols
    missing_total = int(df.isna().sum().sum())

    summary = {
        'totalRows': int(n_rows),
        'totalColumns': int(n_cols),
        'numericColumns': type_counts['numeric'],
        'categoricalColumns': type_counts['categorical'],
        'datetimeColumns': type_counts['datetime'],
        # Anything not numeric/categorical/datetime counts as text.
        'textColumns': int(n_cols) - sum(type_counts.values()),
        'overallMissingRate': round(missing_total / cell_total * 100, 2) if cell_total > 0 else 0,
        'totalMissingCells': missing_total
    }

    logger.info(f"数据画像生成完成: {type_counts['numeric']} 数值列, {type_counts['categorical']} 分类列")

    return {
        'columns': columns,
        'summary': summary
    }
|
||||
|
||||
|
||||
def analyze_column(col: pd.Series, col_name: str, max_unique_values: int = 20) -> Dict[str, Any]:
    """
    Profile a single column.

    Args:
        col: column data
        col_name: column name
        max_unique_values: cap on the number of unique values listed for
            categorical columns

    Returns:
        Column profile dict with type-specific statistics merged in.
    """
    total_count = len(col)
    valid = col.dropna()
    missing_count = total_count - len(valid)
    missing_rate = round(missing_count / total_count * 100, 2) if total_count else 0
    unique_count = int(valid.nunique())

    inferred = infer_column_type(col, unique_count, total_count)

    profile: Dict[str, Any] = {
        'name': col_name,
        'type': inferred,
        'missingCount': missing_count,
        'missingRate': missing_rate,
        'uniqueCount': unique_count,
        'totalCount': total_count
    }

    # Dispatch to the matching detail analyzer; 'text' gets no extra stats.
    detail = {
        'numeric': lambda: analyze_numeric_column(valid),
        'categorical': lambda: analyze_categorical_column(valid, max_unique_values),
        'datetime': lambda: analyze_datetime_column(valid),
    }.get(inferred)
    if detail is not None:
        profile.update(detail())

    return profile
|
||||
|
||||
|
||||
def infer_column_type(col: pd.Series, unique_count: int, total_count: int) -> str:
    """
    Infer the semantic type of a column.

    Args:
        col: column data (may contain missing values)
        unique_count: number of distinct non-null values in the column
        total_count: total number of rows in the column

    Returns:
        'numeric' | 'categorical' | 'datetime' | 'text'
    """
    if pd.api.types.is_datetime64_any_dtype(col):
        return 'datetime'

    if pd.api.types.is_numeric_dtype(col):
        # A numeric dtype with very few distinct values is treated as an
        # encoded categorical (e.g. 0/1 group flags).
        unique_ratio = unique_count / total_count if total_count > 0 else 0
        if unique_count <= 10 and unique_ratio < 0.05:
            return 'categorical'
        return 'numeric'

    if col.dtype == 'object' or col.dtype == 'string':
        non_null = col.dropna()
        if len(non_null) == 0:
            return 'text'

        # Low-cardinality strings are categorical regardless of content.
        unique_ratio = unique_count / total_count if total_count > 0 else 0
        if unique_count <= 30 and unique_ratio < 0.1:
            return 'categorical'

        # Strings that fully parse as numbers are numeric. Narrowed from a
        # bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            pd.to_numeric(non_null, errors='raise')
            return 'numeric'
        except (ValueError, TypeError):
            pass

        # Strings that fully parse as dates are datetime (ParserError is a
        # ValueError subclass; OverflowError covers out-of-range timestamps).
        try:
            pd.to_datetime(non_null, errors='raise')
            return 'datetime'
        except (ValueError, TypeError, OverflowError):
            pass

        return 'text'

    # Any other dtype (bool, category, ...) falls back to text.
    return 'text'
|
||||
|
||||
|
||||
def analyze_numeric_column(col: pd.Series) -> Dict[str, Any]:
    """
    Compute descriptive statistics for a numeric column.

    Args:
        col: non-null column values (non-numeric entries are coerced to NaN
            and dropped)

    Returns:
        Stats dict, or {} when no numeric values remain. Statistics that are
        undefined for the sample (std of a single value, skew/kurtosis of a
        constant column) are reported as None instead of NaN so the result
        stays strictly JSON-serializable.
    """
    if len(col) == 0:
        return {}

    col_numeric = pd.to_numeric(col, errors='coerce').dropna()

    if len(col_numeric) == 0:
        return {}

    def _safe(value, ndigits=4):
        # NaN/inf would produce invalid JSON downstream; report None instead.
        v = float(value)
        return round(v, ndigits) if np.isfinite(v) else None

    q1 = float(col_numeric.quantile(0.25))
    q3 = float(col_numeric.quantile(0.75))
    iqr = q3 - q1
    # Tukey fences: values beyond 1.5*IQR from the quartiles are outliers.
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outlier_count = int(((col_numeric < lower_bound) | (col_numeric > upper_bound)).sum())
    n = len(col_numeric)

    return {
        'mean': _safe(col_numeric.mean()),
        'std': _safe(col_numeric.std()),
        'median': _safe(col_numeric.median()),
        'min': _safe(col_numeric.min()),
        'max': _safe(col_numeric.max()),
        'q1': round(q1, 4),
        'q3': round(q3, 4),
        'iqr': round(iqr, 4),
        'outlierCount': outlier_count,
        'outlierRate': round(outlier_count / n * 100, 2) if n > 0 else 0,
        'skewness': _safe(col_numeric.skew()) if n >= 3 else None,
        'kurtosis': _safe(col_numeric.kurtosis()) if n >= 4 else None
    }
|
||||
|
||||
|
||||
def analyze_categorical_column(col: pd.Series, max_values: int = 20) -> Dict[str, Any]:
    """
    Summarise a categorical column: most frequent levels and the mode.

    Args:
        col: non-null column values
        max_values: cap on the number of top levels reported

    Returns:
        {'topValues', 'totalLevels', 'modeValue', 'modeCount'}, or {} when
        the column is empty.
    """
    total = len(col)
    if total == 0:
        return {}

    counts = col.value_counts()

    # value_counts is sorted by frequency, so the head is the top levels.
    top_values = [
        {
            'value': str(level),
            'count': int(freq),
            'percentage': round(freq / total * 100, 2),
        }
        for level, freq in counts.head(max_values).items()
    ]

    has_levels = len(counts) > 0
    return {
        'topValues': top_values,
        'totalLevels': int(len(counts)),
        'modeValue': str(counts.index[0]) if has_levels else None,
        'modeCount': int(counts.iloc[0]) if has_levels else 0,
    }
|
||||
|
||||
|
||||
def analyze_datetime_column(col: pd.Series) -> Dict[str, Any]:
    """
    Summarise a datetime column: range in ISO-8601 form.

    Args:
        col: non-null column values (strings or datetimes)

    Returns:
        {'minDate', 'maxDate', 'dateRange'}, or {} when the column is empty
        or cannot be interpreted as datetimes (best-effort by design).
    """
    if len(col) == 0:
        return {}

    try:
        col_dt = pd.to_datetime(col, errors='coerce').dropna()

        if len(col_dt) == 0:
            return {}

        return {
            'minDate': col_dt.min().isoformat(),
            'maxDate': col_dt.max().isoformat(),
            'dateRange': str(col_dt.max() - col_dt.min())
        }
    except (ValueError, TypeError, OverflowError):
        # Narrowed from a bare `except:`; the empty dict keeps the original
        # best-effort contract without masking interpreter-level exceptions.
        return {}
|
||||
|
||||
|
||||
def get_quality_score(profile: Dict[str, Any]) -> Dict[str, Any]:
    """
    Score overall data quality from a DataProfile.

    Penalises missingness, per-column outliers and small sample sizes,
    then maps the final score to a letter grade.

    Args:
        profile: DataProfile as produced by generate_data_profile

    Returns:
        {'score', 'grade', 'gradeDescription', 'issues', 'recommendations'}
    """
    summary = profile.get('summary', {})
    columns = profile.get('columns', [])

    issues: List[str] = []
    recommendations: List[str] = []
    penalty = 0.0

    # --- Missingness penalty (tiered) ---
    missing_rate = summary.get('overallMissingRate', 0)
    if missing_rate > 20:
        penalty += 30
        issues.append(f"整体缺失率较高 ({missing_rate}%)")
        recommendations.append("建议检查数据完整性,考虑缺失值处理")
    elif missing_rate > 10:
        penalty += 15
        issues.append(f"整体缺失率中等 ({missing_rate}%)")
        recommendations.append("建议在分析前处理缺失值")
    elif missing_rate > 5:
        penalty += 5
        issues.append(f"存在少量缺失 ({missing_rate}%)")

    # --- Per-column outlier penalty ---
    for col in columns:
        if col.get('outlierRate', 0) > 10:
            penalty += 5
            issues.append(f"列 '{col['name']}' 存在较多异常值 ({col['outlierRate']}%)")
            recommendations.append(f"建议检查列 '{col['name']}' 的异常值")

    # --- Sample-size penalty ---
    total_rows = summary.get('totalRows', 0)
    if total_rows < 30:
        penalty += 20
        issues.append(f"样本量较小 (n={total_rows})")
        recommendations.append("小样本可能影响统计检验的效力")
    elif total_rows < 100:
        penalty += 10
        issues.append(f"样本量中等 (n={total_rows})")

    score = max(0.0, min(100.0, 100.0 - penalty))

    # --- Letter grade from score thresholds ---
    for threshold, grade, grade_desc in (
        (80, 'A', '数据质量良好'),
        (60, 'B', '数据质量中等'),
        (40, 'C', '数据质量较差'),
    ):
        if score >= threshold:
            break
    else:
        grade, grade_desc = 'D', '数据质量很差'

    return {
        'score': round(score, 1),
        'grade': grade,
        'gradeDescription': grade_desc,
        'issues': issues,
        'recommendations': recommendations,
    }
|
||||
Reference in New Issue
Block a user