Files
AIclinicalresearch/r-statistics-service/tools/mann_whitney.R
HaHafeng 371e1c069c feat(ssa): Complete QPER architecture - Query, Planner, Execute, Reflection layers
Implement the full QPER intelligent analysis pipeline:

- Phase E+: Block-based standardization for all 7 R tools, DynamicReport renderer, Word export enhancement

- Phase Q: LLM intent parsing with dynamic Zod validation against real column names, ClarificationCard component, DataProfile is_id_like tagging

- Phase P: ConfigLoader with Zod schema validation and hot-reload API, DecisionTableService (4-dimension matching), FlowTemplateService with EPV protection, PlannedTrace audit output

- Phase R: ReflectionService with statistical slot injection, sensitivity analysis conflict rules, ConclusionReport with section reveal animation, conclusion caching API, graceful R error classification

End-to-end test: 40/40 passed across two complete analysis scenarios.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 18:15:53 +08:00

293 lines
8.9 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_MANN_WHITNEY
#' @name Mann-Whitney U 检验
#' @version 1.0.0
#' @description 两组独立样本非参数比较Wilcoxon秩和检验
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
group_var <- p$group_var
value_var <- p$value_var
# ===== 参数校验 =====
if (!(group_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
}
if (!(value_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = value_var))
}
# ===== 数据清洗 =====
original_rows <- nrow(df)
df <- df[!is.na(df[[group_var]]) & trimws(as.character(df[[group_var]])) != "", ]
df <- df[!is.na(df[[value_var]]), ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# 分组检查
groups <- unique(df[[group_var]])
if (length(groups) != 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = group_var, expected = 2, actual = length(groups)))
}
# 提取两组数据
g1_vals <- df[df[[group_var]] == groups[1], value_var]
g2_vals <- df[df[[group_var]] == groups[2], value_var]
# ===== 护栏检查 =====
guardrail_results <- list()
warnings_list <- c()
# 样本量检查每组至少5个
min_n <- min(length(g1_vals), length(g2_vals))
sample_check <- check_sample_size(min_n, min_required = 5, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: 每组最小 {min_n}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
if (length(guardrail_status$warnings) > 0) {
warnings_list <- c(warnings_list, guardrail_status$warnings)
}
# ===== 核心计算 =====
log_add("执行 Mann-Whitney U 检验 (Wilcoxon rank-sum test)")
result <- tryCatch({
wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
}, error = function(e) {
log_add(paste("Mann-Whitney U 检验失败:", e$message))
return(NULL)
})
if (is.null(result)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "Mann-Whitney U 检验执行失败"))
}
# 计算效应量 r = Z / sqrt(N)
n1 <- length(g1_vals)
n2 <- length(g2_vals)
N <- n1 + n2
# 从 U 统计量计算 Z 值
U <- result$statistic
mu <- n1 * n2 / 2
sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
z_value <- (U - mu) / sigma
effect_r <- abs(z_value) / sqrt(N)
# 效应量解释
effect_interpretation <- if (effect_r < 0.1) "微小" else if (effect_r < 0.3) "小" else if (effect_r < 0.5) "中等" else "大"
log_add(glue("U = {round(U, 2)}, Z = {round(z_value, 3)}, p = {round(result$p.value, 4)}, r = {round(effect_r, 3)}"))
# ===== 生成图表 =====
log_add("生成箱线图")
plot_base64 <- tryCatch({
generate_boxplot(df, group_var, value_var)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: Mann-Whitney U 检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
group_var <- "{group_var}"
value_var <- "{value_var}"
# 数据清洗
df <- df[!is.na(df[[group_var]]) & !is.na(df[[value_var]]), ]
# Mann-Whitney U 检验
g1_vals <- df[df[[group_var]] == "{groups[1]}", value_var]
g2_vals <- df[df[[group_var]] == "{groups[2]}", value_var]
result <- wilcox.test(g1_vals, g2_vals, exact = FALSE, correct = TRUE)
print(result)
# 计算效应量 r
n1 <- length(g1_vals)
n2 <- length(g2_vals)
U <- result$statistic
mu <- n1 * n2 / 2
sigma <- sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
z_value <- (U - mu) / sigma
effect_r <- abs(z_value) / sqrt(n1 + n2)
cat("Effect size r =", round(effect_r, 3), "\\n")
# 可视化
ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#8b5cf6", alpha = 0.6) +
theme_minimal() +
labs(title = paste("Distribution of", value_var, "by", group_var))
')
# ===== 构建 report_blocks =====
log_add("构建 report_blocks")
blocks <- list()
# Block 1: 样本概况(两组 n, median, IQR
g1_label <- as.character(groups[1])
g2_label <- as.character(groups[2])
blocks[[length(blocks) + 1]] <- make_kv_block(
title = "样本概况",
items = list(
list(key = paste0(g1_label, " (n, Median, IQR)"),
value = paste0("n=", n1, ", ", round(median(g1_vals), 3), ", ", round(IQR(g1_vals), 3))),
list(key = paste0(g2_label, " (n, Median, IQR)"),
value = paste0("n=", n2, ", ", round(median(g2_vals), 3), ", ", round(IQR(g2_vals), 3)))
)
)
# Block 2: 检验结果U 统计量, Z 值, P 值, 效应量 r
blocks[[length(blocks) + 1]] <- make_table_block(
title = "Mann-Whitney U 检验结果",
headers = c("U 统计量", "Z 值", "P 值", "效应量 r", "效应量解释"),
rows = list(
list(
round(as.numeric(U), 4),
round(z_value, 4),
format_p_value(result$p.value),
round(effect_r, 4),
effect_interpretation
)
),
footnote = "Wilcoxon rank sum test with continuity correction"
)
# Block 3: 箱线图(如果 plot_base64 不为 NULL
if (!is.null(plot_base64)) {
blocks[[length(blocks) + 1]] <- make_image_block(
base64_data = plot_base64,
title = paste0(value_var, " by ", group_var),
alt = paste("箱线图:", value_var, "按", group_var, "分组")
)
}
# Block 4: 结论摘要
sig <- if (result$p.value < 0.05) "存在统计学显著差异" else "差异无统计学意义"
blocks[[length(blocks) + 1]] <- make_markdown_block(
title = "结果摘要",
content = paste0(
"两组 **", value_var, "** 的比较Mann-Whitney U 检验):",
"U = ", round(as.numeric(U), 2),
"Z = ", round(z_value, 3),
"P ", format_p_value(result$p.value),
",效应量 r = ", round(effect_r, 3), "", effect_interpretation, ")。",
"两组间", sig, "。"
)
)
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = list(
method = "Wilcoxon rank sum test with continuity correction",
statistic_U = jsonlite::unbox(as.numeric(U)),
z_value = jsonlite::unbox(round(z_value, 4)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
effect_size = list(
r = jsonlite::unbox(round(effect_r, 4)),
interpretation = effect_interpretation
),
group_stats = list(
list(
group = as.character(groups[1]),
n = n1,
median = median(g1_vals),
iqr = IQR(g1_vals),
min = min(g1_vals),
max = max(g1_vals)
),
list(
group = as.character(groups[2]),
n = n2,
median = median(g2_vals),
iqr = IQR(g2_vals),
min = min(g2_vals),
max = max(g2_vals)
)
)
),
report_blocks = blocks,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:生成箱线图
generate_boxplot <- function(df, group_var, value_var) {
p <- ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#8b5cf6", alpha = 0.6) +
geom_jitter(width = 0.2, alpha = 0.3, size = 1) +
theme_minimal() +
labs(
title = paste("Distribution of", value_var, "by", group_var),
subtitle = "Mann-Whitney U Test"
)
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}