Files
HaHafeng 3446909ff7 feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development
Phase I - Session Blackboard + READ Layer:
- SessionBlackboardService with Postgres-Only cache
- DataProfileService for data overview generation
- PicoInferenceService for LLM-driven PICO extraction
- Frontend DataContextCard and VariableDictionaryPanel
- E2E tests: 31/31 passed

Phase II - Conversation Layer LLM + Intent Router:
- ConversationService with SSE streaming
- IntentRouterService (rule-first + LLM fallback, 6 intents)
- SystemPromptService with 6-segment dynamic assembly
- TokenTruncationService for context management
- ChatHandlerService as unified chat entry
- Frontend SSAChatPane and useSSAChat hook
- E2E tests: 38/38 passed

Phase III - Method Consultation + AskUser Standardization:
- ToolRegistryService with Repository Pattern
- MethodConsultService with DecisionTable + LLM enhancement
- AskUserService with global interrupt handling
- Frontend AskUserCard component
- E2E tests: 13/13 passed

Phase IV - Dialogue-Driven Analysis + QPER Integration:
- ToolOrchestratorService (plan/execute/report)
- analysis_plan SSE event for WorkflowPlan transmission
- Dual-channel confirmation (ask_user card + workspace button)
- PICO as optional hint for LLM parsing
- E2E tests: 25/25 passed

R Statistics Service:
- 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon
- Enhanced guardrails and block helpers
- Comprehensive test suite (run_all_tools_test.js)

Documentation:
- Updated system status document (v5.9)
- Updated SSA module status and development plan (v1.8)

Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-22 18:53:39 +08:00

287 lines
9.6 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_WILCOXON
#' @name Wilcoxon 符号秩检验
#' @version 1.0.0
#' @description 配对样本的非参数检验(配对 T 检验的替代方法)
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
before_var <- p$before_var
after_var <- p$after_var
# ===== 参数校验 =====
if (!(before_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = before_var))
}
if (!(after_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = after_var))
}
# ===== 数据清洗 =====
original_rows <- nrow(df)
df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
# 确保数值型
if (!is.numeric(df[[before_var]])) {
df[[before_var]] <- as.numeric(as.character(df[[before_var]]))
df <- df[!is.na(df[[before_var]]), ]
}
if (!is.numeric(df[[after_var]])) {
df[[after_var]] <- as.numeric(as.character(df[[after_var]]))
df <- df[!is.na(df[[after_var]]), ]
}
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# ===== 护栏检查 =====
guardrail_results <- list()
warnings_list <- c()
sample_check <- check_sample_size(nrow(df), min_required = 5, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("配对样本量: N = {nrow(df)}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(status = "blocked", message = guardrail_status$reason, trace_log = logs))
}
# ===== 计算差值 =====
diff_values <- df[[after_var]] - df[[before_var]]
# 检查差值方差(容差比较避免浮点精度问题)
if (isTRUE(sd(diff_values) < .Machine$double.eps^0.5)) {
return(make_error(ERROR_CODES$E007_VARIANCE_ZERO, col = paste(after_var, "-", before_var)))
}
# ===== 核心计算 =====
log_add("执行 Wilcoxon 符号秩检验")
result <- tryCatch({
wilcox.test(df[[before_var]], df[[after_var]], paired = TRUE, conf.int = TRUE)
}, error = function(e) {
log_add(paste("Wilcoxon 检验失败:", e$message))
return(NULL)
})
if (is.null(result)) {
return(map_r_error("Wilcoxon 符号秩检验计算失败"))
}
method_used <- result$method
log_add(glue("V = {result$statistic}, P = {round(result$p.value, 4)}"))
# ===== 效应量: r = Z / sqrt(N) =====
n_pairs <- nrow(df)
z_approx <- qnorm(result$p.value / 2)
r_effect <- abs(z_approx) / sqrt(n_pairs)
r_interpretation <- if (r_effect < 0.1) "微小" else if (r_effect < 0.3) "小" else if (r_effect < 0.5) "中等" else "大"
# ===== 描述统计 =====
before_vals <- df[[before_var]]
after_vals <- df[[after_var]]
desc_stats <- list(
before = list(
variable = before_var,
n = length(before_vals),
mean = round(mean(before_vals), 3),
sd = round(sd(before_vals), 3),
median = round(median(before_vals), 3),
q1 = round(quantile(before_vals, 0.25), 3),
q3 = round(quantile(before_vals, 0.75), 3)
),
after = list(
variable = after_var,
n = length(after_vals),
mean = round(mean(after_vals), 3),
sd = round(sd(after_vals), 3),
median = round(median(after_vals), 3),
q1 = round(quantile(after_vals, 0.25), 3),
q3 = round(quantile(after_vals, 0.75), 3)
),
difference = list(
mean = round(mean(diff_values), 3),
sd = round(sd(diff_values), 3),
median = round(median(diff_values), 3)
)
)
output_results <- list(
method = method_used,
statistic_V = jsonlite::unbox(as.numeric(result$statistic)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
pseudomedian = if (!is.null(result$estimate)) jsonlite::unbox(round(as.numeric(result$estimate), 4)) else NULL,
conf_int = if (!is.null(result$conf.int)) round(as.numeric(result$conf.int), 4) else NULL,
effect_size = list(
r = jsonlite::unbox(round(r_effect, 4)),
interpretation = r_interpretation
),
descriptive = desc_stats
)
# ===== 生成图表 =====
log_add("生成配对变化图")
plot_base64 <- tryCatch({
generate_paired_plot(df, before_var, after_var, diff_values)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: Wilcoxon 符号秩检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
before_var <- "{before_var}"
after_var <- "{after_var}"
# 数据清洗
df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
# Wilcoxon 符号秩检验
result <- wilcox.test(df[[before_var]], df[[after_var]], paired = TRUE, conf.int = TRUE)
print(result)
# 描述统计
cat("Before: median =", median(df[[before_var]]), "\\n")
cat("After: median =", median(df[[after_var]]), "\\n")
cat("Diff: median =", median(df[[after_var]] - df[[before_var]]), "\\n")
')
# ===== 构建 report_blocks =====
blocks <- list()
# Block 1: 描述统计
desc_kv <- list()
desc_kv[["配对样本量"]] <- as.character(n_pairs)
desc_kv[[paste0(before_var, " Median [Q1, Q3]")]] <- as.character(glue("{desc_stats$before$median} [{desc_stats$before$q1}, {desc_stats$before$q3}]"))
desc_kv[[paste0(after_var, " Median [Q1, Q3]")]] <- as.character(glue("{desc_stats$after$median} [{desc_stats$after$q1}, {desc_stats$after$q3}]"))
desc_kv[["差值 Median"]] <- as.character(desc_stats$difference$median)
blocks[[length(blocks) + 1]] <- make_kv_block(desc_kv, title = "样本概况")
# Block 2: 检验结果
kv_result <- list(
"方法" = method_used,
"V 统计量" = as.character(round(as.numeric(result$statistic), 1)),
"P 值" = output_results$p_value_fmt,
"效应量 r" = as.character(output_results$effect_size$r),
"效应量解释" = r_interpretation
)
if (!is.null(output_results$pseudomedian)) {
kv_result[["伪中位数"]] <- as.character(output_results$pseudomedian)
}
if (!is.null(output_results$conf_int)) {
kv_result[["95% 置信区间"]] <- sprintf("[%.4f, %.4f]", output_results$conf_int[1], output_results$conf_int[2])
}
blocks[[length(blocks) + 1]] <- make_kv_block(kv_result, title = "Wilcoxon 符号秩检验结果")
# Block 3: 图表
if (!is.null(plot_base64)) {
blocks[[length(blocks) + 1]] <- make_image_block(plot_base64,
title = paste("配对变化:", before_var, "→", after_var),
alt = "配对样本前后变化图")
}
# Block 4: 结论摘要
sig_text <- if (result$p.value < 0.05) "差异具有统计学意义" else "差异无统计学意义"
direction <- if (mean(diff_values) > 0) "升高" else "降低"
conclusion <- glue(
"Wilcoxon 符号秩检验结果V = {round(as.numeric(result$statistic), 1)}P {output_results$p_value_fmt}。",
"配对样本从 **{before_var}** 到 **{after_var}** 的变化{sig_text}",
"(中位数{direction} {abs(desc_stats$difference$median)},效应量 r = {output_results$effect_size$r}{r_interpretation}效应)。"
)
blocks[[length(blocks) + 1]] <- make_markdown_block(conclusion, title = "结论摘要")
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = output_results,
report_blocks = blocks,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:配对变化图(差值直方图 + 配对连线图)
generate_paired_plot <- function(df, before_var, after_var, diff_values) {
# 配对连线图
n <- nrow(df)
plot_df <- data.frame(
id = rep(1:n, 2),
time = rep(c("Before", "After"), each = n),
value = c(df[[before_var]], df[[after_var]])
)
plot_df$time <- factor(plot_df$time, levels = c("Before", "After"))
p <- ggplot(plot_df, aes(x = time, y = value)) +
geom_line(aes(group = id), alpha = 0.3, color = "gray60") +
geom_point(aes(color = time), size = 2, alpha = 0.6) +
stat_summary(fun = median, geom = "point", shape = 18, size = 5, color = "red") +
stat_summary(fun = median, geom = "line", aes(group = 1), color = "red", linewidth = 1.2) +
theme_minimal() +
labs(
title = paste("Paired Change:", before_var, "→", after_var),
x = "",
y = "Value"
) +
scale_color_manual(values = c("Before" = "#3b82f6", "After" = "#ef4444")) +
theme(legend.position = "none")
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 6, height = 5, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}