Files
AIclinicalresearch/r-statistics-service/tools/mann_whitney.R
HaHafeng 428a22adf2 feat(ssa): Complete Phase 2A frontend integration - multi-step workflow end-to-end
Phase 2A: WorkflowPlannerService, WorkflowExecutorService, Python data quality, 6 bug fixes, DescriptiveResultView, multi-step R code/Word export, MVP UI reuse. V11 UI: Gemini-style, multi-task, single-page scroll, Word export. Architecture: Block-based rendering consensus (4 block types). New R tools: chi_square, correlation, descriptive, logistic_binary, mann_whitney, t_test_paired. Docs: dev summary, block-based plan, status updates, task list v2.0.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-20 23:09:27 +08:00

236 lines
6.9 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_MANN_WHITNEY
#' @name Mann-Whitney U 检验
#' @version 1.0.0
#' @description 两组独立样本非参数比较Wilcoxon秩和检验
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
group_var <- p$group_var
value_var <- p$value_var
# ===== 参数校验 =====
if (!(group_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
}
if (!(value_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = value_var))
}
# ===== 数据清洗 =====
original_rows <- nrow(df)
df <- df[!is.na(df[[group_var]]) & trimws(as.character(df[[group_var]])) != "", ]
df <- df[!is.na(df[[value_var]]), ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# 分组检查
groups <- unique(df[[group_var]])
if (length(groups) != 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = group_var, expected = 2, actual = length(groups)))
}
# 提取两组数据
g1_vals <- df[df[[group_var]] == groups[1], value_var]
g2_vals <- df[df[[group_var]] == groups[2], value_var]
# ===== 护栏检查 =====
guardrail_results <- list()
warnings_list <- c()
# 样本量检查每组至少5个
min_n <- min(length(g1_vals), length(g2_vals))
sample_check <- check_sample_size(min_n, min_required = 5, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: 每组最小 {min_n}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
if (length(guardrail_status$warnings) > 0) {
warnings_list <- c(warnings_list, guardrail_status$warnings)
}
# ===== 核心计算 =====
log_add("执行 Mann-Whitney U 检验 (Wilcoxon rank-sum test)")
result <- tryCatch({
wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
}, error = function(e) {
log_add(paste("Mann-Whitney U 检验失败:", e$message))
return(NULL)
})
if (is.null(result)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "Mann-Whitney U 检验执行失败"))
}
# 计算效应量 r = Z / sqrt(N)
n1 <- length(g1_vals)
n2 <- length(g2_vals)
N <- n1 + n2
# 从 U 统计量计算 Z 值
U <- result$statistic
mu <- n1 * n2 / 2
sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
z_value <- (U - mu) / sigma
effect_r <- abs(z_value) / sqrt(N)
# 效应量解释
effect_interpretation <- if (effect_r < 0.1) "微小" else if (effect_r < 0.3) "小" else if (effect_r < 0.5) "中等" else "大"
log_add(glue("U = {round(U, 2)}, Z = {round(z_value, 3)}, p = {round(result$p.value, 4)}, r = {round(effect_r, 3)}"))
# ===== 生成图表 =====
log_add("生成箱线图")
plot_base64 <- tryCatch({
generate_boxplot(df, group_var, value_var)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: Mann-Whitney U 检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
group_var <- "{group_var}"
value_var <- "{value_var}"
# 数据清洗
df <- df[!is.na(df[[group_var]]) & !is.na(df[[value_var]]), ]
# Mann-Whitney U 检验
g1_vals <- df[df[[group_var]] == "{groups[1]}", value_var]
g2_vals <- df[df[[group_var]] == "{groups[2]}", value_var]
result <- wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
print(result)
# 计算效应量 r
n1 <- length(g1_vals)
n2 <- length(g2_vals)
U <- result$statistic
mu <- n1 * n2 / 2
sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
z_value <- (U - mu) / sigma
effect_r <- abs(z_value) / sqrt(n1 + n2)
cat("Effect size r =", round(effect_r, 3), "\\n")
# 可视化
ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#8b5cf6", alpha = 0.6) +
theme_minimal() +
labs(title = paste("Distribution of", value_var, "by", group_var))
')
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = list(
method = "Wilcoxon rank sum test with continuity correction",
statistic_U = jsonlite::unbox(as.numeric(U)),
z_value = jsonlite::unbox(round(z_value, 4)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
effect_size = list(
r = jsonlite::unbox(round(effect_r, 4)),
interpretation = effect_interpretation
),
group_stats = list(
list(
group = as.character(groups[1]),
n = n1,
median = median(g1_vals),
iqr = IQR(g1_vals),
min = min(g1_vals),
max = max(g1_vals)
),
list(
group = as.character(groups[2]),
n = n2,
median = median(g2_vals),
iqr = IQR(g2_vals),
min = min(g2_vals),
max = max(g2_vals)
)
)
),
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:生成箱线图
generate_boxplot <- function(df, group_var, value_var) {
p <- ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#8b5cf6", alpha = 0.6) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
theme_minimal() +
labs(
title = paste("Distribution of", value_var, "by", group_var),
subtitle = "Mann-Whitney U Test"
)
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}