feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end

Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-20 23:09:27 +08:00
parent 23b422f758
commit 428a22adf2
62 changed files with 15416 additions and 299 deletions

View File

@@ -0,0 +1,254 @@
#' @tool_code ST_CHI_SQUARE
#' @name 卡方检验
#' @version 1.0.0
#' @description 两个分类变量的独立性检验
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Run a chi-square test of independence between two categorical columns.
#'
#' Reads `input$params$var1` / `var2`, cleans blank/NA rows, builds a
#' contingency table, and checks expected cell counts: for a sparse 2x2 table
#' it automatically falls back to Fisher's exact test; larger sparse tables
#' keep the chi-square with a warning. On success returns a list with
#' `status = "success"`, `results`, `plots`, `trace_log` and
#' `reproducible_code`; otherwise a `make_error()` object or a
#' `status = "blocked"` list from the guardrail chain.
#'
#' NOTE(review): relies on project helpers defined elsewhere
#' (`load_input_data`, `make_error`, `ERROR_CODES`, `check_sample_size`,
#' `ACTION_BLOCK`, `run_guardrail_chain`, `format_p_value`).
run_analysis <- function(input) {
# ===== Initialization =====
logs <- c()
# Append a timestamped message to the trace log (captured via `<<-`).
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)  # NOTE(review): empty handler — placeholder? confirm intent
# ===== Data loading =====
log_add("开始加载输入数据")
# Load via the project helper; a load failure is converted to NULL here and
# mapped to an internal-error result below.
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
var1 <- p$var1
var2 <- p$var2
# ===== Parameter validation =====
if (!(var1 %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var1))
}
if (!(var2 %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var2))
}
# ===== Data cleaning =====
# Drop rows where either variable is NA or blank after trimming whitespace.
original_rows <- nrow(df)
df <- df[!is.na(df[[var1]]) & trimws(as.character(df[[var1]])) != "", ]
df <- df[!is.na(df[[var2]]) & trimws(as.character(df[[var2]])) != "", ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# ===== Guardrail checks =====
guardrail_results <- list()
warnings_list <- c()
# Sample-size guardrail: fewer than 10 usable rows blocks the analysis.
sample_check <- check_sample_size(nrow(df), min_required = 10, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: N = {nrow(df)}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
# NOTE(review): unlike the correlation tool, `guardrail_status$warnings` is
# never merged into `warnings_list` here — confirm whether that is intended.
# ===== Build the contingency table =====
contingency_table <- table(df[[var1]], df[[var2]])
log_add(glue("列联表维度: {nrow(contingency_table)} x {ncol(contingency_table)}"))
# An independence test needs at least 2 levels on each dimension.
if (nrow(contingency_table) < 2 || ncol(contingency_table) < 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = paste(var1, "或", var2),
expected = 2,
actual = min(nrow(contingency_table), ncol(contingency_table))))
}
# ===== Expected-frequency check (decides chi-square vs. Fisher) =====
# NOTE(review): this preliminary chisq.test() call can itself emit the
# "Chi-squared approximation may be incorrect" warning on sparse tables.
expected <- chisq.test(contingency_table)$expected
low_expected_count <- sum(expected < 5)
total_cells <- length(expected)
low_expected_pct <- low_expected_count / total_cells
use_fisher <- FALSE
is_2x2 <- nrow(contingency_table) == 2 && ncol(contingency_table) == 2
# More than 20% of cells with expected count < 5 triggers the fallback rule.
if (low_expected_pct > 0.2) {
warnings_list <- c(warnings_list, glue("期望频数 < 5 的格子占 {round(low_expected_pct * 100, 1)}%"))
if (is_2x2) {
use_fisher <- TRUE
log_add("2x2 表且期望频数不足,自动切换为 Fisher 精确检验")
} else {
log_add("期望频数不足,但非 2x2 表,继续使用卡方检验(结果需谨慎解读)")
}
}
# ===== Core computation =====
if (use_fisher) {
log_add("执行 Fisher 精确检验")
result <- fisher.test(contingency_table)
method_used <- "Fisher's Exact Test"
output_results <- list(
method = method_used,
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
# Odds ratio / CI are only produced by fisher.test() for 2x2 tables.
odds_ratio = if (!is.null(result$estimate)) jsonlite::unbox(as.numeric(result$estimate)) else NULL,
conf_int = if (!is.null(result$conf.int)) as.numeric(result$conf.int) else NULL
)
} else {
log_add("执行 Pearson 卡方检验")
result <- chisq.test(contingency_table, correct = is_2x2) # Yates continuity correction for 2x2 tables
method_used <- if (is_2x2) "Pearson's Chi-squared test with Yates' continuity correction" else "Pearson's Chi-squared test"
# Cramér's V effect size: sqrt(chi2 / (n * (k - 1))), k = min(rows, cols).
n <- sum(contingency_table)
k <- min(nrow(contingency_table), ncol(contingency_table))
cramers_v <- sqrt(result$statistic / (n * (k - 1)))
# Qualitative label for the effect size (negligible/small/medium/large).
v_interpretation <- if (cramers_v < 0.1) "微小" else if (cramers_v < 0.3) "小" else if (cramers_v < 0.5) "中等" else "大"
log_add(glue("χ² = {round(result$statistic, 3)}, df = {result$parameter}, p = {round(result$p.value, 4)}, Cramér's V = {round(cramers_v, 3)}"))
output_results <- list(
method = method_used,
statistic = jsonlite::unbox(as.numeric(result$statistic)),
df = jsonlite::unbox(as.numeric(result$parameter)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
effect_size = list(
cramers_v = jsonlite::unbox(round(as.numeric(cramers_v), 4)),
interpretation = v_interpretation
)
)
}
# Attach a compact view of the contingency table (no raw data).
# Convert the `table` object to a plain numeric matrix for JSON serialization.
observed_matrix <- matrix(
as.numeric(contingency_table),
nrow = nrow(contingency_table),
ncol = ncol(contingency_table),
dimnames = list(rownames(contingency_table), colnames(contingency_table))
)
output_results$contingency_table <- list(
row_var = var1,
col_var = var2,
row_levels = as.character(rownames(contingency_table)),
col_levels = as.character(colnames(contingency_table)),
observed = observed_matrix,
row_totals = as.numeric(rowSums(contingency_table)),
col_totals = as.numeric(colSums(contingency_table)),
grand_total = jsonlite::unbox(sum(contingency_table))
)
# ===== Plot generation =====
log_add("生成马赛克图")
# Plot failures are non-fatal: the analysis result is returned without a plot.
plot_base64 <- tryCatch({
generate_mosaic_plot(contingency_table, var1, var2)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== Reproducible-code generation =====
# Fall back to a generic filename when the upload name is absent/empty.
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 卡方检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
var1 <- "{var1}"
var2 <- "{var2}"
# 数据清洗
df <- df[!is.na(df[[var1]]) & !is.na(df[[var2]]), ]
# 构建列联表
contingency_table <- table(df[[var1]], df[[var2]])
print(contingency_table)
# 卡方检验
result <- chisq.test(contingency_table)
print(result)
# 计算 Cramer V (效应量)
n <- sum(contingency_table)
k <- min(nrow(contingency_table), ncol(contingency_table))
cramers_v <- sqrt(result$statistic / (n * (k - 1)))
cat("Cramer V =", round(cramers_v, 3), "\\n")
# 可视化(马赛克图)
mosaicplot(contingency_table, main = "Mosaic Plot", color = TRUE)
')
# ===== Return results =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = output_results,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# Helper: render a mosaic-style view of a contingency table (approximated
# with a proportion-filled stacked bar chart in ggplot2) and return it as a
# base64-encoded PNG data URI.
generate_mosaic_plot <- function(contingency_table, var1, var2) {
  # Flatten the table into long format: one row per (level1, level2, count).
  long_df <- as.data.frame(contingency_table)
  names(long_df) <- c("Var1", "Var2", "Freq")
  plot_obj <- ggplot(long_df, aes(x = Var1, y = Freq, fill = Var2)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = scales::percent) +
    theme_minimal() +
    labs(
      title = paste("Association between", var1, "and", var2),
      x = var1,
      y = "Proportion",
      fill = var2
    ) +
    scale_fill_brewer(palette = "Set2")
  # Write to a temp PNG, encode it, then clean up the file.
  png_path <- tempfile(fileext = ".png")
  ggsave(png_path, plot_obj, width = 7, height = 5, dpi = 100)
  encoded <- base64encode(png_path)
  unlink(png_path)
  paste0("data:image/png;base64,", encoded)
}

View File

@@ -0,0 +1,242 @@
#' @tool_code ST_CORRELATION
#' @name 相关分析
#' @version 1.0.0
#' @description Pearson/Spearman 相关系数计算
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Run a correlation analysis (Pearson or Spearman) between two numeric columns.
#'
#' `input$params` must provide `var_x` and `var_y`, and may provide `method`
#' ("pearson", "spearman", or "auto"; default "auto"). In auto mode, Pearson is
#' used only when both variables pass the project normality check, otherwise
#' the analysis switches to Spearman with a warning. On success returns a list
#' with `status = "success"`, `results` (coefficient, p-value, interpretation,
#' per-variable descriptives, and a Pearson CI when available), `plots`,
#' `trace_log`, and `reproducible_code`.
#'
#' NOTE(review): relies on project helpers defined elsewhere
#' (`load_input_data`, `make_error`, `ERROR_CODES`, `check_sample_size`,
#' `ACTION_WARN`, `run_guardrail_chain`, `check_normality`, `format_p_value`).
run_analysis <- function(input) {
# ===== Initialization =====
logs <- c()
# Append a timestamped message to the trace log (captured via `<<-`).
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)  # NOTE(review): empty handler — placeholder? confirm intent
# ===== Data loading =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
guardrails_cfg <- input$guardrails  # NOTE(review): read but never used below
var_x <- p$var_x
var_y <- p$var_y
# Requested method: one of "pearson", "spearman", "auto" (default).
method <- tolower(p$method %||% "auto")
# ===== Parameter validation =====
if (!(var_x %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var_x))
}
if (!(var_y %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var_y))
}
# ===== Data cleaning =====
# Pairwise-complete cases only: drop rows where either variable is NA.
original_rows <- nrow(df)
df <- df[!is.na(df[[var_x]]) & !is.na(df[[var_y]]), ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
x_vals <- df[[var_x]]
y_vals <- df[[var_y]]
n <- length(x_vals)
# ===== Guardrail checks =====
guardrail_results <- list()
warnings_list <- c()
# Sample-size guardrail: below 10 pairs this warns (does not block).
sample_check <- check_sample_size(n, min_required = 10, action = ACTION_WARN)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: N = {n}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
# Surface any non-blocking guardrail warnings in the final result.
if (length(guardrail_status$warnings) > 0) {
warnings_list <- c(warnings_list, guardrail_status$warnings)
}
# ===== Method selection =====
final_method <- method
if (method == "auto") {
log_add("自动选择相关方法(检验正态性)")
# Pearson requires approximate normality of both variables.
norm_x <- check_normality(x_vals, alpha = 0.05)
norm_y <- check_normality(y_vals, alpha = 0.05)
log_add(glue("{var_x} 正态性: p = {round(norm_x$p_value, 4)}, {norm_x$reason}"))
log_add(glue("{var_y} 正态性: p = {round(norm_y$p_value, 4)}, {norm_y$reason}"))
if (norm_x$passed && norm_y$passed) {
final_method <- "pearson"
log_add("两变量均满足正态性,使用 Pearson 相关")
} else {
final_method <- "spearman"
log_add("存在非正态变量,使用 Spearman 秩相关")
warnings_list <- c(warnings_list, "变量不满足正态性,自动切换为 Spearman 秩相关")
}
}
# ===== Core computation =====
# NOTE(review): cor.test(method = "spearman") warns when ties prevent an
# exact p-value; that warning is not captured here — confirm acceptable.
log_add(glue("执行 {final_method} 相关分析"))
result <- cor.test(x_vals, y_vals, method = final_method)
r_value <- result$estimate
p_value <- result$p.value
# Qualitative label for |r| (negligible/weak/moderate/strong bands).
r_abs <- abs(r_value)
r_interpretation <- if (r_abs < 0.1) "可忽略" else if (r_abs < 0.3) "弱" else if (r_abs < 0.5) "中等" else if (r_abs < 0.7) "较强" else "强"
log_add(glue("r = {round(r_value, 4)}, p = {round(p_value, 4)}, 相关强度: {r_interpretation}"))
# ===== Plot generation =====
log_add("生成散点图")
# Plot failures are non-fatal: the analysis result is returned without a plot.
plot_base64 <- tryCatch({
generate_scatter_plot(df, var_x, var_y, r_value, p_value, final_method)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== Reproducible-code generation =====
# Fall back to a generic filename when the upload name is absent/empty.
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 相关分析
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
var_x <- "{var_x}"
var_y <- "{var_y}"
# 数据清洗
df <- df[!is.na(df[[var_x]]) & !is.na(df[[var_y]]), ]
# {final_method} 相关分析
result <- cor.test(df[[var_x]], df[[var_y]], method = "{final_method}")
print(result)
# 可视化
ggplot(df, aes(x = .data[[var_x]], y = .data[[var_y]])) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "#3b82f6") +
theme_minimal() +
labs(title = paste("Correlation:", var_x, "vs", var_y),
subtitle = paste("r =", round(result$estimate, 3), ", p =", round(result$p.value, 4)))
')
# ===== Return results =====
log_add("分析完成")
output_results <- list(
method = if (final_method == "pearson") "Pearson product-moment correlation" else "Spearman's rank correlation rho",
method_code = final_method,
statistic = jsonlite::unbox(round(as.numeric(r_value), 4)),
p_value = jsonlite::unbox(as.numeric(p_value)),
p_value_fmt = format_p_value(p_value),
interpretation = r_interpretation,
n = n,
variables = list(x = var_x, y = var_y),
descriptive = list(
x = list(
variable = var_x,
mean = round(mean(x_vals), 3),
sd = round(sd(x_vals), 3),
median = round(median(x_vals), 3)
),
y = list(
variable = var_y,
mean = round(mean(y_vals), 3),
sd = round(sd(y_vals), 3),
median = round(median(y_vals), 3)
)
)
)
# Only the Pearson test yields a confidence interval for r.
if (final_method == "pearson" && !is.null(result$conf.int)) {
output_results$conf_int <- as.numeric(result$conf.int)
}
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = output_results,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# Helper: scatter plot with a linear fit, annotated with the correlation
# coefficient and p-value; returned as a base64-encoded PNG data URI.
generate_scatter_plot <- function(df, var_x, var_y, r_value, p_value, method) {
  # Downsample very large inputs (fixed seed) so the rendered PNG stays small.
  if (nrow(df) > 1000) {
    set.seed(42)
    df <- df[sample(nrow(df), 1000), ]
  }
  subtitle_text <- paste0(
    ifelse(method == "pearson", "Pearson ", "Spearman "),
    "r = ", round(r_value, 3),
    ", p ", format_p_value(p_value)
  )
  plot_obj <- ggplot(df, aes(x = .data[[var_x]], y = .data[[var_y]])) +
    geom_point(alpha = 0.5, color = "#64748b", size = 2) +
    geom_smooth(method = "lm", se = TRUE, color = "#3b82f6", fill = "#93c5fd") +
    theme_minimal() +
    labs(
      title = paste("Correlation:", var_x, "vs", var_y),
      subtitle = subtitle_text,
      x = var_x,
      y = var_y
    )
  # Write to a temp PNG, encode it, then clean up the file.
  png_path <- tempfile(fileext = ".png")
  ggsave(png_path, plot_obj, width = 6, height = 5, dpi = 100)
  encoded <- base64encode(png_path)
  unlink(png_path)
  paste0("data:image/png;base64,", encoded)
}
# Helper: null-coalescing infix operator — returns `b` when `a` is NULL,
# otherwise `a`. (Base R ships the same operator from version 4.4.)
`%||%` <- function(a, b) {
  if (is.null(a)) {
    b
  } else {
    a
  }
}

View File

@@ -0,0 +1,332 @@
#' @tool_code ST_DESCRIPTIVE
#' @name 描述性统计
#' @version 1.0.0
#' @description 数据概况与基线特征表
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Run a descriptive-statistics analysis over selected (or all) columns.
#'
#' `input$params` may provide `variables` (defaults to every column) and an
#' optional `group_var` for stratified summaries. Each column is classified as
#' numeric or categorical, summarized via `calc_numeric_stats()` /
#' `calc_categorical_stats()` (per group when `group_var` is set), and the
#' first few variables get a histogram or bar chart. Returns a list with
#' `status`, `results` (`summary` + per-variable entries), `plots`,
#' `trace_log`, and `reproducible_code`.
#'
#' NOTE(review): relies on project helpers defined elsewhere
#' (`load_input_data`, `make_error`, `ERROR_CODES`).
run_analysis <- function(input) {
# ===== Initialization =====
logs <- c()
# Append a timestamped message to the trace log (captured via `<<-`).
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)  # NOTE(review): empty handler — placeholder? confirm intent
# ===== Data loading =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
variables <- p$variables # optional variable list (empty means: analyze all)
group_var <- p$group_var # optional grouping variable
# ===== Decide which variables to analyze =====
if (is.null(variables) || length(variables) == 0) {
variables <- names(df)
log_add("未指定变量,分析全部列")
}
# The grouping variable itself is never summarized.
if (!is.null(group_var) && group_var %in% variables) {
variables <- setdiff(variables, group_var)
}
# Validate that every requested variable exists.
missing_vars <- setdiff(variables, names(df))
if (length(missing_vars) > 0) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND,
col = paste(missing_vars, collapse = ", ")))
}
# Validate the grouping variable and collect its non-NA levels.
groups <- NULL
if (!is.null(group_var) && group_var != "") {
if (!(group_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
}
groups <- unique(df[[group_var]][!is.na(df[[group_var]])])
log_add(glue("分组变量: {group_var}, 分组: {paste(groups, collapse=', ')}"))
}
# ===== Variable type inference =====
# Numeric columns with very few distinct values (ratio < 5% and <= 10
# levels) are treated as coded categoricals; everything non-numeric is
# categorical.
var_types <- sapply(variables, function(v) {
vals <- df[[v]]
if (is.numeric(vals)) {
non_na_count <- sum(!is.na(vals))
if (non_na_count == 0) {
return("categorical") # all-NA column: treat as categorical
}
unique_count <- length(unique(vals[!is.na(vals)]))
unique_ratio <- unique_count / non_na_count
if (unique_ratio < 0.05 && unique_count <= 10) {
return("categorical")
}
return("numeric")
} else {
return("categorical")
}
})
log_add(glue("数值变量: {sum(var_types == 'numeric')}, 分类变量: {sum(var_types == 'categorical')}"))
# ===== Compute descriptive statistics =====
warnings_list <- c()
results_list <- list()
for (v in variables) {
var_type <- as.character(var_types[v])
if (is.na(var_type) || length(var_type) == 0) {
var_type <- "categorical" # defensive default
}
if (is.null(groups)) {
# Ungrouped summary for this variable.
if (identical(var_type, "numeric")) {
stats <- calc_numeric_stats(df[[v]], v)
} else {
stats <- calc_categorical_stats(df[[v]], v)
}
stats$type <- var_type
results_list[[v]] <- stats
} else {
# Grouped summary: one stats entry per group level.
# NOTE(review): `df[[group_var]] == g` yields NA for rows where the
# grouping variable is NA, and NA row indices inject all-NA rows into
# the subset — presumably harmless since the calc_* helpers drop NAs,
# but it inflates the reported `missing` counts; verify.
group_stats <- list()
for (g in groups) {
subset_vals <- df[df[[group_var]] == g, v, drop = TRUE]
if (identical(var_type, "numeric")) {
group_stats[[as.character(g)]] <- calc_numeric_stats(subset_vals, v)
} else {
group_stats[[as.character(g)]] <- calc_categorical_stats(subset_vals, v)
}
}
results_list[[v]] <- list(
variable = v,
type = var_type,
by_group = group_stats
)
}
}
# ===== Overall summary =====
summary_stats <- list(
n_total = nrow(df),
n_variables = length(variables),
n_numeric = sum(var_types == "numeric"),
n_categorical = sum(var_types == "categorical")
)
if (!is.null(groups)) {
summary_stats$group_var <- group_var
summary_stats$groups <- lapply(groups, function(g) {
list(name = as.character(g), n = sum(df[[group_var]] == g, na.rm = TRUE))
})
}
# ===== Plot generation =====
log_add("生成描述性统计图表")
plots <- list()
# Only plot the first few variables to keep the payload small.
vars_to_plot <- head(variables, 4)
for (v in vars_to_plot) {
# Plot failures are non-fatal; the variable is simply skipped.
plot_base64 <- tryCatch({
if (var_types[v] == "numeric") {
generate_histogram(df, v, group_var)
} else {
generate_bar_chart(df, v, group_var)
}
}, error = function(e) {
log_add(paste("图表生成失败:", v, e$message))
NULL
})
if (!is.null(plot_base64)) {
plots <- c(plots, list(plot_base64))
}
}
# ===== Reproducible-code generation =====
# Fall back to a generic filename when the upload name is absent/empty.
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 描述性统计
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
# 数值变量描述性统计
numeric_vars <- sapply(df, is.numeric)
if (any(numeric_vars)) {{
summary(df[, numeric_vars, drop = FALSE])
}}
# 分类变量频数表
categorical_vars <- !numeric_vars
if (any(categorical_vars)) {{
for (v in names(df)[categorical_vars]) {{
cat("\\n变量:", v, "\\n")
print(table(df[[v]], useNA = "ifany"))
}}
}}
# 可视化示例
# ggplot(df, aes(x = your_variable)) + geom_histogram()
')
# ===== Return results =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = list(
summary = summary_stats,
variables = results_list
),
plots = plots,
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# ===== Helper functions =====
# Summary statistics for one numeric column.
#
# @param vals Numeric vector, possibly containing NA.
# @param var_name Column name, echoed back in the result.
# @return A list with `variable`, `n` (non-NA count), `missing` (NA count),
#   and — when at least one value is present — location/spread statistics
#   plus a pre-formatted "mean ± sd" string.
calc_numeric_stats <- function(vals, var_name) {
  # BUG FIX: the original filtered NAs out of `vals` *before* computing the
  # missing count, so `missing` was always reported as 0. Capture the original
  # length first so the NA count is correct.
  total <- length(vals)
  vals <- vals[!is.na(vals)]
  n <- length(vals)
  missing <- total - n
  if (n == 0) {
    # No usable values: report counts only.
    return(list(
      variable = var_name,
      n = 0,
      missing = missing,
      stats = NULL
    ))
  }
  list(
    variable = var_name,
    n = n,
    missing = missing,
    mean = round(mean(vals), 3),
    sd = round(sd(vals), 3),
    median = round(median(vals), 3),
    q1 = round(quantile(vals, 0.25), 3),
    q3 = round(quantile(vals, 0.75), 3),
    iqr = round(IQR(vals), 3),
    min = round(min(vals), 3),
    max = round(max(vals), 3),
    skewness = round(calc_skewness(vals), 3),
    formatted = paste0(round(mean(vals), 2), " ± ", round(sd(vals), 2))
  )
}
# Frequency summary for one categorical column: per-level counts and
# percentages (relative to the non-missing total), plus the NA count.
calc_categorical_stats <- function(vals, var_name) {
  total <- length(vals)
  valid <- sum(!is.na(vals))
  freq_table <- table(vals, useNA = "no")
  # Build one entry per observed level, with a display-ready "n (pct%)" string.
  build_level <- function(level) {
    count <- as.numeric(freq_table[level])
    pct <- round(count / valid * 100, 1)
    list(
      level = level,
      n = count,
      pct = pct,
      formatted = paste0(count, " (", pct, "%)")
    )
  }
  list(
    variable = var_name,
    n = valid,
    missing = total - valid,
    levels = lapply(names(freq_table), build_level)
  )
}
# Sample skewness: the third standardized moment, using the biased
# (population-style) estimator mean((x - mean)^3) / sd^3.
# Returns NA for fewer than 3 observations.
calc_skewness <- function(x) {
  if (length(x) < 3) {
    return(NA)
  }
  centered <- x - mean(x)
  mean(centered^3) / sd(x)^3
}
# Render a histogram for one numeric column — optionally overlaid by a
# grouping column — and return it as a base64-encoded PNG data URI.
generate_histogram <- function(df, var_name, group_var = NULL) {
  plot_df <- df[!is.na(df[[var_name]]), ]
  grouped <- !is.null(group_var) && group_var != ""
  if (grouped) {
    plot_obj <- ggplot(plot_df, aes(x = .data[[var_name]], fill = factor(.data[[group_var]]))) +
      geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
      scale_fill_brewer(palette = "Set1", name = group_var) +
      theme_minimal()
  } else {
    plot_obj <- ggplot(plot_df, aes(x = .data[[var_name]])) +
      geom_histogram(fill = "#3b82f6", alpha = 0.7, bins = 30) +
      theme_minimal()
  }
  plot_obj <- plot_obj + labs(title = paste("Distribution of", var_name), x = var_name, y = "Count")
  # Write to a temp PNG, encode it, then clean up the file.
  png_path <- tempfile(fileext = ".png")
  ggsave(png_path, plot_obj, width = 6, height = 4, dpi = 100)
  encoded <- base64encode(png_path)
  unlink(png_path)
  paste0("data:image/png;base64,", encoded)
}
# Render a frequency bar chart for one categorical column — optionally
# dodged by a grouping column — and return it as a base64 PNG data URI.
generate_bar_chart <- function(df, var_name, group_var = NULL) {
  plot_df <- df[!is.na(df[[var_name]]), ]
  grouped <- !is.null(group_var) && group_var != ""
  if (grouped) {
    plot_obj <- ggplot(plot_df, aes(x = factor(.data[[var_name]]), fill = factor(.data[[group_var]]))) +
      geom_bar(position = "dodge") +
      scale_fill_brewer(palette = "Set1", name = group_var) +
      theme_minimal()
  } else {
    plot_obj <- ggplot(plot_df, aes(x = factor(.data[[var_name]]))) +
      geom_bar(fill = "#3b82f6", alpha = 0.7) +
      theme_minimal()
  }
  # Tilt x labels so long level names stay readable.
  plot_obj <- plot_obj + labs(title = paste("Frequency of", var_name), x = var_name, y = "Count") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Write to a temp PNG, encode it, then clean up the file.
  png_path <- tempfile(fileext = ".png")
  ggsave(png_path, plot_obj, width = 6, height = 4, dpi = 100)
  encoded <- base64encode(png_path)
  unlink(png_path)
  paste0("data:image/png;base64,", encoded)
}

View File

@@ -0,0 +1,316 @@
#' @tool_code ST_LOGISTIC_BINARY
#' @name 二元 Logistic 回归
#' @version 1.0.0
#' @description 二分类结局变量的多因素分析
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Fit a binary logistic regression (glm, binomial logit link).
#'
#' `input$params` must provide `outcome_var` (exactly two levels after
#' cleaning), `predictors` (non-empty), and optionally `confounders`. The
#' function listwise-deletes missing rows, checks the events-per-variable
#' (EPV >= 10) rule and a minimum sample size, fits the model, and reports
#' ORs with Wald 95% CIs, fit statistics (AIC, deviances, Nagelkerke R²),
#' optional VIFs (when the `car` package is installed), a forest plot, and
#' reproducible code.
#'
#' NOTE(review): relies on project helpers defined elsewhere
#' (`load_input_data`, `make_error`, `ERROR_CODES`, `check_sample_size`,
#' `ACTION_BLOCK`, `run_guardrail_chain`, `map_r_error`, `format_p_value`).
run_analysis <- function(input) {
# ===== Initialization =====
logs <- c()
# Append a timestamped message to the trace log (captured via `<<-`).
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)  # NOTE(review): empty handler — placeholder? confirm intent
# ===== Data loading =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
outcome_var <- p$outcome_var
predictors <- p$predictors # list of predictor column names
confounders <- p$confounders # optional confounder column names
# ===== Parameter validation =====
if (!(outcome_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = outcome_var))
}
all_vars <- c(predictors, confounders)
# NOTE(review): `!is.null(all_vars)` is a scalar recycled over the vector,
# so effectively only the `!= ""` filter does anything here.
all_vars <- all_vars[!is.null(all_vars) & all_vars != ""]
for (v in all_vars) {
if (!(v %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = v))
}
}
if (length(predictors) == 0) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "至少需要一个预测变量"))
}
# ===== Data cleaning =====
original_rows <- nrow(df)
# Listwise deletion: drop rows missing any model variable.
vars_to_check <- c(outcome_var, all_vars)
for (v in vars_to_check) {
df <- df[!is.na(df[[v]]), ]
}
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# ===== Outcome variable checks =====
outcome_values <- unique(df[[outcome_var]])
if (length(outcome_values) != 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = outcome_var, expected = 2, actual = length(outcome_values)))
}
# glm(binomial) needs the outcome as a factor (or 0/1); coerce if needed.
if (!is.factor(df[[outcome_var]])) {
df[[outcome_var]] <- as.factor(df[[outcome_var]])
}
# Event counts: the rarer outcome class drives the EPV rule below.
event_counts <- table(df[[outcome_var]])
n_events <- min(event_counts)
n_predictors <- length(all_vars)
log_add(glue("结局变量分布: {paste(names(event_counts), '=', event_counts, collapse=', ')}"))
log_add(glue("事件数: {n_events}, 预测变量数: {n_predictors}"))
# ===== Guardrail checks =====
guardrail_results <- list()
warnings_list <- c()
# Events-per-variable rule: EPV >= 10 recommended for a stable model.
epv <- n_events / n_predictors
if (epv < 10) {
warnings_list <- c(warnings_list, glue("EPV = {round(epv, 1)} < 10模型可能不稳定"))
log_add(glue("警告: EPV = {round(epv, 1)} < 10"))
}
# Sample-size guardrail: fewer than 20 usable rows blocks the analysis.
sample_check <- check_sample_size(nrow(df), min_required = 20, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
# ===== Build the model formula =====
formula_str <- paste(outcome_var, "~", paste(all_vars, collapse = " + "))
formula_obj <- as.formula(formula_str)
log_add(glue("模型公式: {formula_str}"))
# ===== Core computation =====
log_add("拟合 Logistic 回归模型")
# NOTE(review): BUG — tryCatch() warning handlers are *exiting* handlers, so
# when glm() warns (e.g. "fitted probabilities numerically 0 or 1") the fit
# is abandoned and `invokeRestart("muffleWarning")` is called after the
# muffleWarning restart is gone, which itself errors; the fitted model is
# lost either way. Warnings should be intercepted with
# withCallingHandlers() wrapped in a tryCatch() for errors only.
model <- tryCatch({
glm(formula_obj, data = df, family = binomial(link = "logit"))
}, error = function(e) {
log_add(paste("模型拟合失败:", e$message))
return(NULL)
}, warning = function(w) {
warnings_list <<- c(warnings_list, w$message)
log_add(paste("模型警告:", w$message))
invokeRestart("muffleWarning")
})
if (is.null(model)) {
return(map_r_error("模型拟合失败"))
}
# Non-convergence is reported as a warning, not a failure.
if (!model$converged) {
warnings_list <- c(warnings_list, "模型未完全收敛")
log_add("警告: 模型未完全收敛")
}
# ===== Extract model results =====
coef_summary <- summary(model)$coefficients
# Odds ratios with Wald 95% CIs: exp(beta ± 1.96 * SE).
coef_table <- data.frame(
variable = rownames(coef_summary),
estimate = coef_summary[, "Estimate"],
std_error = coef_summary[, "Std. Error"],
z_value = coef_summary[, "z value"],
p_value = coef_summary[, "Pr(>|z|)"],
stringsAsFactors = FALSE
)
coef_table$OR <- exp(coef_table$estimate)
coef_table$ci_lower <- exp(coef_table$estimate - 1.96 * coef_table$std_error)
coef_table$ci_upper <- exp(coef_table$estimate + 1.96 * coef_table$std_error)
# Convert to a compact list (rounded, no raw coefficients) for the payload.
coefficients_list <- lapply(1:nrow(coef_table), function(i) {
row <- coef_table[i, ]
list(
variable = row$variable,
OR = round(row$OR, 3),
ci_lower = round(row$ci_lower, 3),
ci_upper = round(row$ci_upper, 3),
p_value = round(row$p_value, 4),
p_value_fmt = format_p_value(row$p_value),
significant = row$p_value < 0.05
)
})
# ===== Model fit statistics =====
null_deviance <- model$null.deviance
residual_deviance <- model$deviance
aic <- AIC(model)
# Nagelkerke pseudo-R²: Cox-Snell R² rescaled to a [0, 1] maximum.
n <- nrow(df)
r2_nagelkerke <- (1 - exp((residual_deviance - null_deviance) / n)) / (1 - exp(-null_deviance / n))
log_add(glue("AIC = {round(aic, 2)}, Nagelkerke R² = {round(r2_nagelkerke, 3)}"))
# ===== Collinearity check (VIF) =====
# Optional: only computed when >1 covariate and the `car` package is present.
vif_results <- NULL
if (length(all_vars) > 1) {
tryCatch({
if (requireNamespace("car", quietly = TRUE)) {
vif_values <- car::vif(model)
# car::vif() returns a matrix (GVIF columns) for factor covariates.
if (is.matrix(vif_values)) {
vif_values <- vif_values[, "GVIF"]
}
vif_results <- lapply(names(vif_values), function(v) {
list(variable = v, vif = round(vif_values[v], 2))
})
# VIF > 5 is flagged as a collinearity warning.
high_vif <- names(vif_values)[vif_values > 5]
if (length(high_vif) > 0) {
warnings_list <- c(warnings_list, paste("VIF > 5 的变量:", paste(high_vif, collapse = ", ")))
}
}
}, error = function(e) {
log_add(paste("VIF 计算失败:", e$message))
})
}
# ===== Plot generation (forest plot) =====
log_add("生成森林图")
# Plot failures are non-fatal: the analysis result is returned without a plot.
plot_base64 <- tryCatch({
generate_forest_plot(coef_table)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== Reproducible-code generation =====
# Fall back to a generic filename when the upload name is absent/empty.
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 二元 Logistic 回归
# 时间: {Sys.time()}
# ================================
# 数据准备
df <- read.csv("{original_filename}")
# 模型拟合
model <- glm({formula_str}, data = df, family = binomial(link = "logit"))
summary(model)
# OR 和 95% CI
coef_summary <- summary(model)$coefficients
OR <- exp(coef_summary[, "Estimate"])
CI_lower <- exp(coef_summary[, "Estimate"] - 1.96 * coef_summary[, "Std. Error"])
CI_upper <- exp(coef_summary[, "Estimate"] + 1.96 * coef_summary[, "Std. Error"])
results <- data.frame(OR = OR, CI_lower = CI_lower, CI_upper = CI_upper,
p_value = coef_summary[, "Pr(>|z|)"])
print(round(results, 3))
# 模型拟合度
cat("AIC:", AIC(model), "\\n")
# VIF需要 car 包)
# library(car)
# vif(model)
')
# ===== Return results =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = list(
method = "Binary Logistic Regression (glm, binomial)",
formula = formula_str,
n_observations = nrow(df),
n_predictors = n_predictors,
coefficients = coefficients_list,
model_fit = list(
aic = jsonlite::unbox(round(aic, 2)),
null_deviance = jsonlite::unbox(round(null_deviance, 2)),
residual_deviance = jsonlite::unbox(round(residual_deviance, 2)),
r2_nagelkerke = jsonlite::unbox(round(r2_nagelkerke, 4))
),
vif = vif_results,
epv = jsonlite::unbox(round(epv, 1))
),
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# Render a forest plot of odds ratios with 95% CIs on a log-scaled x axis.
# The intercept row is excluded. Returns NULL when no predictor rows remain,
# otherwise a base64-encoded PNG data URI.
generate_forest_plot <- function(coef_table) {
  predictor_rows <- coef_table[coef_table$variable != "(Intercept)", ]
  if (nrow(predictor_rows) == 0) {
    return(NULL)
  }
  # Reverse factor levels so the first predictor is drawn at the top.
  predictor_rows$variable <- factor(predictor_rows$variable, levels = rev(predictor_rows$variable))
  plot_obj <- ggplot(predictor_rows, aes(x = OR, y = variable)) +
    geom_vline(xintercept = 1, linetype = "dashed", color = "gray50") +
    geom_point(size = 3, color = "#3b82f6") +
    geom_errorbarh(aes(xmin = ci_lower, xmax = ci_upper), height = 0.2, color = "#3b82f6") +
    scale_x_log10() +
    theme_minimal() +
    labs(
      title = "Forest Plot: Odds Ratios with 95% CI",
      x = "Odds Ratio (log scale)",
      y = "Variable"
    ) +
    theme(
      panel.grid.minor = element_blank(),
      axis.text.y = element_text(size = 10)
    )
  # Scale the image height with the predictor count so labels stay readable,
  # then write to a temp PNG, encode it, and clean up the file.
  png_path <- tempfile(fileext = ".png")
  ggsave(png_path, plot_obj, width = 8, height = max(4, nrow(predictor_rows) * 0.5 + 2), dpi = 100)
  encoded <- base64encode(png_path)
  unlink(png_path)
  paste0("data:image/png;base64,", encoded)
}

View File

@@ -0,0 +1,235 @@
#' @tool_code ST_MANN_WHITNEY
#' @name Mann-Whitney U 检验
#' @version 1.0.0
#' @description 两组独立样本非参数比较Wilcoxon秩和检验
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Run a Mann-Whitney U test (Wilcoxon rank-sum) for the SSA-Pro tool runner.
#'
#' `input` is expected to carry `params$group_var` / `params$value_var`
#' (column names; the group column must have exactly two distinct levels
#' after cleaning) and optionally `original_filename`, which is only
#' echoed into the generated reproducible script.
#'
#' NOTE(review): relies on helpers defined elsewhere in the runner
#' environment (load_input_data, make_error, ERROR_CODES,
#' check_sample_size, ACTION_BLOCK, run_guardrail_chain, format_p_value,
#' generate_boxplot) -- assumed to be sourced before this file executes.
#'
#' Returns a list: on success status/message/warnings/results/plots/
#' trace_log/reproducible_code; a "blocked" list when a guardrail fires;
#' or a make_error() payload on validation/internal failure.
run_analysis <- function(input) {
  # ===== Initialization =====
  logs <- c()
  # Append a timestamped entry to the closure-level trace log (via <<-).
  log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
  on.exit({}, add = TRUE)  # placeholder: no cleanup currently registered

  # ===== Data loading =====
  log_add("开始加载输入数据")
  df <- tryCatch(
    load_input_data(input),
    error = function(e) {
      log_add(paste("数据加载失败:", e$message))
      return(NULL)
    }
  )
  if (is.null(df)) {
    return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
  }
  log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
  p <- input$params
  group_var <- p$group_var
  value_var <- p$value_var

  # ===== Parameter validation =====
  if (!(group_var %in% names(df))) {
    return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
  }
  if (!(value_var %in% names(df))) {
    return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = value_var))
  }

  # ===== Data cleaning =====
  # Drop rows with a missing/blank group label or a missing value.
  original_rows <- nrow(df)
  df <- df[!is.na(df[[group_var]]) & trimws(as.character(df[[group_var]])) != "", ]
  df <- df[!is.na(df[[value_var]]), ]
  removed_rows <- original_rows - nrow(df)
  if (removed_rows > 0) {
    log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
  }

  # Group check: the test requires exactly two groups.
  groups <- unique(df[[group_var]])
  if (length(groups) != 2) {
    return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
                      col = group_var, expected = 2, actual = length(groups)))
  }
  # Extract the two samples.
  # NOTE(review): assumes `df` is a base data.frame so single-column `[`
  # drops to a vector; a tibble would keep a data frame here -- confirm
  # what load_input_data returns.
  g1_vals <- df[df[[group_var]] == groups[1], value_var]
  g2_vals <- df[df[[group_var]] == groups[2], value_var]

  # ===== Guardrail checks =====
  guardrail_results <- list()
  warnings_list <- c()
  # Sample-size check: fewer than 5 observations in either group blocks.
  min_n <- min(length(g1_vals), length(g2_vals))
  sample_check <- check_sample_size(min_n, min_required = 5, action = ACTION_BLOCK)
  guardrail_results <- c(guardrail_results, list(sample_check))
  log_add(glue("样本量检查: 每组最小 {min_n}, {sample_check$reason}"))
  guardrail_status <- run_guardrail_chain(guardrail_results)
  if (guardrail_status$status == "blocked") {
    return(list(
      status = "blocked",
      message = guardrail_status$reason,
      trace_log = logs
    ))
  }
  if (length(guardrail_status$warnings) > 0) {
    warnings_list <- c(warnings_list, guardrail_status$warnings)
  }

  # ===== Core computation =====
  log_add("执行 Mann-Whitney U 检验 (Wilcoxon rank-sum test)")
  result <- tryCatch({
    wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
  }, error = function(e) {
    log_add(paste("Mann-Whitney U 检验失败:", e$message))
    return(NULL)
  })
  if (is.null(result)) {
    return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "Mann-Whitney U 检验执行失败"))
  }
  # Effect size r = Z / sqrt(N)
  n1 <- length(g1_vals)
  n2 <- length(g2_vals)
  N <- n1 + n2
  # Derive Z from the U statistic via the normal approximation.
  # NOTE(review): this Z omits the continuity correction that
  # wilcox.test(correct = TRUE) applies to the p-value, so the reported z
  # and p can disagree slightly near the boundary -- confirm intended.
  U <- result$statistic
  mu <- n1 * n2 / 2
  sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
  z_value <- (U - mu) / sigma
  effect_r <- abs(z_value) / sqrt(N)
  # Effect-size interpretation using conventional cutoffs for r.
  effect_interpretation <- if (effect_r < 0.1) "微小" else if (effect_r < 0.3) "小" else if (effect_r < 0.5) "中等" else "大"
  log_add(glue("U = {round(U, 2)}, Z = {round(z_value, 3)}, p = {round(result$p.value, 4)}, r = {round(effect_r, 3)}"))

  # ===== Plot generation =====
  log_add("生成箱线图")
  plot_base64 <- tryCatch({
    generate_boxplot(df, group_var, value_var)
  }, error = function(e) {
    log_add(paste("图表生成失败:", e$message))
    NULL  # plotting failure is non-fatal; analysis results still returned
  })

  # ===== Reproducible code generation =====
  original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
    input$original_filename
  } else {
    "data.csv"
  }
  # Standalone user-facing script; glue interpolates the chosen columns.
  reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: Mann-Whitney U 检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
group_var <- "{group_var}"
value_var <- "{value_var}"
# 数据清洗
df <- df[!is.na(df[[group_var]]) & !is.na(df[[value_var]]), ]
# Mann-Whitney U 检验
g1_vals <- df[df[[group_var]] == "{groups[1]}", value_var]
g2_vals <- df[df[[group_var]] == "{groups[2]}", value_var]
result <- wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
print(result)
# 计算效应量 r
n1 <- length(g1_vals)
n2 <- length(g2_vals)
U <- result$statistic
mu <- n1 * n2 / 2
sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
z_value <- (U - mu) / sigma
effect_r <- abs(z_value) / sqrt(n1 + n2)
cat("Effect size r =", round(effect_r, 3), "\\n")
# 可视化
ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#8b5cf6", alpha = 0.6) +
theme_minimal() +
labs(title = paste("Distribution of", value_var, "by", group_var))
')

  # ===== Return results =====
  log_add("分析完成")
  return(list(
    status = "success",
    message = "分析完成",
    warnings = if (length(warnings_list) > 0) warnings_list else NULL,
    results = list(
      method = "Wilcoxon rank sum test with continuity correction",
      statistic_U = jsonlite::unbox(as.numeric(U)),
      z_value = jsonlite::unbox(round(z_value, 4)),
      p_value = jsonlite::unbox(as.numeric(result$p.value)),
      p_value_fmt = format_p_value(result$p.value),
      effect_size = list(
        r = jsonlite::unbox(round(effect_r, 4)),
        interpretation = effect_interpretation
      ),
      # Per-group descriptives. NOTE(review): scalars here are not
      # unbox()ed (unlike the fields above), so they may serialize as
      # length-1 arrays -- confirm the frontend tolerates this.
      group_stats = list(
        list(
          group = as.character(groups[1]),
          n = n1,
          median = median(g1_vals),
          iqr = IQR(g1_vals),
          min = min(g1_vals),
          max = max(g1_vals)
        ),
        list(
          group = as.character(groups[2]),
          n = n2,
          median = median(g2_vals),
          iqr = IQR(g2_vals),
          min = min(g2_vals),
          max = max(g2_vals)
        )
      )
    ),
    plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
    trace_log = logs,
    reproducible_code = as.character(reproducible_code)
  ))
}
# Helper: boxplot of `value_var` split by `group_var` with jittered raw
# points overlaid, returned as a base64 PNG data URI for the frontend.
generate_boxplot <- function(df, group_var, value_var) {
  chart_labels <- labs(
    title = paste("Distribution of", value_var, "by", group_var),
    subtitle = "Mann-Whitney U Test"
  )
  fig <- ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]]))
  fig <- fig + geom_boxplot(fill = "#8b5cf6", alpha = 0.6)
  fig <- fig + geom_jitter(width = 0.2, alpha = 0.3, size = 1)
  fig <- fig + theme_minimal()
  fig <- fig + chart_labels
  # Write to a temp PNG, encode, and clean the file up afterwards.
  out_png <- tempfile(fileext = ".png")
  on.exit(unlink(out_png), add = TRUE)
  ggsave(out_png, fig, width = 6, height = 4, dpi = 100)
  paste0("data:image/png;base64,", base64encode(out_png))
}

View File

@@ -0,0 +1,274 @@
#' @tool_code ST_T_TEST_PAIRED
#' @name 配对 T 检验
#' @version 1.0.0
#' @description 配对设计的均值差异检验(前后对比)
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
#' Run a paired-sample mean-difference test for the SSA-Pro tool runner.
#'
#' `input` must carry `params$before_var` / `params$after_var` (column
#' names of the paired measurements) and may carry `guardrails`
#' (`check_normality` flag) plus `original_filename` (echoed only into
#' the generated script). Differences are computed as after - before.
#'
#' When the normality guardrail on the differences requests a switch,
#' the analysis falls back from the paired t-test to the Wilcoxon
#' signed-rank test and records a warning.
#'
#' NOTE(review): depends on helpers defined elsewhere in the runner
#' environment (load_input_data, make_error, ERROR_CODES,
#' check_sample_size, check_normality, ACTION_WARN, ACTION_SWITCH,
#' run_guardrail_chain, format_p_value, generate_paired_plot) -- assumed
#' sourced before this file executes.
#'
#' Returns a list: success payload with results/plots/trace_log/
#' reproducible_code, a "blocked" list when a guardrail blocks, or a
#' make_error() payload on failure.
run_analysis <- function(input) {
  # ===== Initialization =====
  logs <- c()
  # Append a timestamped entry to the closure-level trace log (via <<-).
  log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
  on.exit({}, add = TRUE)  # placeholder: no cleanup currently registered

  # ===== Data loading =====
  log_add("开始加载输入数据")
  df <- tryCatch(
    load_input_data(input),
    error = function(e) {
      log_add(paste("数据加载失败:", e$message))
      return(NULL)
    }
  )
  if (is.null(df)) {
    return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
  }
  log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
  p <- input$params
  guardrails_cfg <- input$guardrails
  before_var <- p$before_var
  after_var <- p$after_var

  # ===== Parameter validation =====
  if (!(before_var %in% names(df))) {
    return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = before_var))
  }
  if (!(after_var %in% names(df))) {
    return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = after_var))
  }

  # ===== Data cleaning =====
  # Keep only complete pairs (both measurements present).
  original_rows <- nrow(df)
  df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
  removed_rows <- original_rows - nrow(df)
  if (removed_rows > 0) {
    log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
  }
  before_vals <- df[[before_var]]
  after_vals <- df[[after_var]]
  diff_vals <- after_vals - before_vals  # positive = increase after
  n <- length(diff_vals)

  # ===== Guardrail checks =====
  guardrail_results <- list()
  warnings_list <- c()
  method_used <- "t.test"  # overwritten below once the branch is chosen
  use_wilcoxon <- FALSE
  # Sample-size check: fewer than 10 pairs warns but does not block.
  sample_check <- check_sample_size(n, min_required = 10, action = ACTION_WARN)
  guardrail_results <- c(guardrail_results, list(sample_check))
  log_add(glue("样本量检查: N = {n}, {sample_check$reason}"))
  # Normality check on the paired differences; may request a method switch.
  if (isTRUE(guardrails_cfg$check_normality) && n >= 3) {
    log_add("执行差值正态性检验")
    norm_check <- check_normality(diff_vals, alpha = 0.05,
                                  action = ACTION_SWITCH,
                                  action_target = "Wilcoxon signed-rank test")
    guardrail_results <- c(guardrail_results, list(norm_check))
    log_add(glue("差值正态性: p = {round(norm_check$p_value, 4)}, {norm_check$reason}"))
  }
  guardrail_status <- run_guardrail_chain(guardrail_results)
  if (guardrail_status$status == "blocked") {
    return(list(
      status = "blocked",
      message = guardrail_status$reason,
      trace_log = logs
    ))
  }
  if (guardrail_status$status == "switch") {
    use_wilcoxon <- TRUE
    log_add(glue("触发方法切换: {guardrail_status$reason}"))
    warnings_list <- c(warnings_list, "差值不满足正态性,自动切换为 Wilcoxon 符号秩检验")
  }
  if (length(guardrail_status$warnings) > 0) {
    warnings_list <- c(warnings_list, guardrail_status$warnings)
  }

  # ===== Core computation =====
  if (use_wilcoxon) {
    log_add("执行 Wilcoxon 符号秩检验")
    result <- wilcox.test(before_vals, after_vals, paired = TRUE, exact = FALSE)
    method_used <- "Wilcoxon signed rank test"
    # Wilcoxon effect size r: |z| recovered from the two-sided p-value,
    # signed by the median difference.
    # NOTE(review): z collapses to 0 when median(diff_vals) == 0 even if
    # the test is significant -- confirm this edge case is acceptable.
    z_value <- qnorm(result$p.value / 2) * sign(median(diff_vals))
    effect_r <- abs(z_value) / sqrt(n)
    effect_interpretation <- if (abs(effect_r) < 0.1) "微小" else if (abs(effect_r) < 0.3) "小" else if (abs(effect_r) < 0.5) "中等" else "大"
    output_results <- list(
      method = method_used,
      statistic = jsonlite::unbox(as.numeric(result$statistic)),
      p_value = jsonlite::unbox(as.numeric(result$p.value)),
      p_value_fmt = format_p_value(result$p.value),
      effect_size = list(
        r = jsonlite::unbox(round(effect_r, 4)),
        interpretation = effect_interpretation
      )
    )
  } else {
    log_add("执行配对 T 检验")
    result <- t.test(before_vals, after_vals, paired = TRUE)
    method_used <- "Paired t-test"
    # Cohen's d for paired samples: mean difference / SD of differences.
    mean_diff <- mean(diff_vals)
    sd_diff <- sd(diff_vals)
    cohens_d <- mean_diff / sd_diff
    effect_interpretation <- if (abs(cohens_d) < 0.2) "微小" else if (abs(cohens_d) < 0.5) "小" else if (abs(cohens_d) < 0.8) "中等" else "大"
    log_add(glue("t = {round(result$statistic, 3)}, df = {round(result$parameter, 1)}, p = {round(result$p.value, 4)}, Cohen's d = {round(cohens_d, 3)}"))
    output_results <- list(
      method = method_used,
      statistic = jsonlite::unbox(as.numeric(result$statistic)),
      df = jsonlite::unbox(as.numeric(result$parameter)),
      p_value = jsonlite::unbox(as.numeric(result$p.value)),
      p_value_fmt = format_p_value(result$p.value),
      conf_int = as.numeric(result$conf.int),
      effect_size = list(
        cohens_d = jsonlite::unbox(round(cohens_d, 4)),
        interpretation = effect_interpretation
      )
    )
  }

  # Add descriptive statistics for before/after and their difference.
  output_results$descriptive <- list(
    before = list(
      variable = before_var,
      n = n,
      mean = round(mean(before_vals), 3),
      sd = round(sd(before_vals), 3),
      median = round(median(before_vals), 3)
    ),
    after = list(
      variable = after_var,
      n = n,
      mean = round(mean(after_vals), 3),
      sd = round(sd(after_vals), 3),
      median = round(median(after_vals), 3)
    ),
    difference = list(
      mean = round(mean(diff_vals), 3),
      sd = round(sd(diff_vals), 3),
      median = round(median(diff_vals), 3)
    )
  )

  # ===== Plot generation =====
  log_add("生成配对比较图")
  plot_base64 <- tryCatch({
    generate_paired_plot(df, before_var, after_var, diff_vals)
  }, error = function(e) {
    log_add(paste("图表生成失败:", e$message))
    NULL  # plotting failure is non-fatal; analysis results still returned
  })

  # ===== Reproducible code generation =====
  original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
    input$original_filename
  } else {
    "data.csv"
  }
  # Standalone user-facing script. NOTE(review): always emits the t-test
  # variant, even when this run switched to Wilcoxon -- confirm intended.
  reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 配对 T 检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
before_var <- "{before_var}"
after_var <- "{after_var}"
# 数据清洗
df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
# 配对 T 检验
before_vals <- df[[before_var]]
after_vals <- df[[after_var]]
result <- t.test(before_vals, after_vals, paired = TRUE)
print(result)
# Cohen d (效应量)
diff_vals <- after_vals - before_vals
cohens_d <- mean(diff_vals) / sd(diff_vals)
cat("Cohen d =", round(cohens_d, 3), "\\n")
# 可视化
df_long <- data.frame(
id = rep(1:nrow(df), 2),
time = rep(c("Before", "After"), each = nrow(df)),
value = c(before_vals, after_vals)
)
ggplot(df_long, aes(x = time, y = value, group = id)) +
geom_line(alpha = 0.3) +
geom_point() +
theme_minimal() +
labs(title = "Paired Comparison")
')

  # ===== Return results =====
  log_add("分析完成")
  return(list(
    status = "success",
    message = "分析完成",
    warnings = if (length(warnings_list) > 0) warnings_list else NULL,
    results = output_results,
    plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
    trace_log = logs,
    reproducible_code = as.character(reproducible_code)
  ))
}
# Helper: spaghetti-style paired-comparison plot (Before vs After) with
# per-subject lines and the group means overlaid in red; returns the
# rendered chart as a base64 PNG data URI.
# NOTE(review): `size` on line geoms is soft-deprecated in favor of
# `linewidth` since ggplot2 3.4 -- kept as-is to preserve behavior;
# confirm the deployed ggplot2 version before modernizing.
generate_paired_plot <- function(df, before_var, after_var, diff_vals) {
  n_pairs <- nrow(df)
  # Stack the two measurement columns into long format (one row per
  # subject-timepoint), keeping "Before" first on the x axis.
  long_df <- data.frame(
    id = rep(1:n_pairs, 2),
    time = factor(rep(c("Before", "After"), each = n_pairs), levels = c("Before", "After")),
    value = c(df[[before_var]], df[[after_var]])
  )
  fig <- ggplot(long_df, aes(x = time, y = value)) +
    geom_line(aes(group = id), alpha = 0.3, color = "gray60") +
    geom_point(aes(group = id), alpha = 0.5, size = 2) +
    stat_summary(fun = mean, geom = "point", size = 4, color = "#ef4444", shape = 18) +
    stat_summary(fun = mean, geom = "line", aes(group = 1), color = "#ef4444", size = 1.2) +
    theme_minimal() +
    labs(
      title = paste("Paired Comparison:", before_var, "vs", after_var),
      subtitle = paste("n =", n_pairs, ", Mean change =", round(mean(diff_vals), 2)),
      x = "Time Point",
      y = "Value"
    )
  # Render to a temp PNG, encode, and remove the file.
  out_file <- tempfile(fileext = ".png")
  ggsave(out_file, fig, width = 6, height = 5, dpi = 100)
  encoded <- base64encode(out_file)
  unlink(out_file)
  paste0("data:image/png;base64,", encoded)
}