Files
AIclinicalresearch/r-statistics-service/tools/t_test_paired.R
HaHafeng 371e1c069c feat(ssa): Complete QPER architecture - Query, Planner, Execute, Reflection layers
Implement the full QPER intelligent analysis pipeline:

- Phase E+: Block-based standardization for all 7 R tools, DynamicReport renderer, Word export enhancement

- Phase Q: LLM intent parsing with dynamic Zod validation against real column names, ClarificationCard component, DataProfile is_id_like tagging

- Phase P: ConfigLoader with Zod schema validation and hot-reload API, DecisionTableService (4-dimension matching), FlowTemplateService with EPV protection, PlannedTrace audit output

- Phase R: ReflectionService with statistical slot injection, sensitivity analysis conflict rules, ConclusionReport with section reveal animation, conclusion caching API, graceful R error classification

End-to-end test: 40/40 passed across two complete analysis scenarios.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 18:15:53 +08:00

367 lines
12 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_T_TEST_PAIRED
#' @name 配对 T 检验
#' @version 1.0.0
#' @description 配对设计的均值差异检验(前后对比)
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
guardrails_cfg <- input$guardrails
before_var <- p$before_var
after_var <- p$after_var
# ===== 参数校验 =====
if (!(before_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = before_var))
}
if (!(after_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = after_var))
}
# ===== 数据清洗 =====
original_rows <- nrow(df)
df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
before_vals <- df[[before_var]]
after_vals <- df[[after_var]]
diff_vals <- after_vals - before_vals
n <- length(diff_vals)
# ===== 护栏检查 =====
guardrail_results <- list()
warnings_list <- c()
method_used <- "t.test"
use_wilcoxon <- FALSE
# 样本量检查
sample_check <- check_sample_size(n, min_required = 10, action = ACTION_WARN)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: N = {n}, {sample_check$reason}"))
# 差值正态性检验
if (isTRUE(guardrails_cfg$check_normality) && n >= 3) {
log_add("执行差值正态性检验")
norm_check <- check_normality(diff_vals, alpha = 0.05,
action = ACTION_SWITCH,
action_target = "Wilcoxon signed-rank test")
guardrail_results <- c(guardrail_results, list(norm_check))
log_add(glue("差值正态性: p = {round(norm_check$p_value, 4)}, {norm_check$reason}"))
}
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
if (guardrail_status$status == "switch") {
use_wilcoxon <- TRUE
log_add(glue("触发方法切换: {guardrail_status$reason}"))
warnings_list <- c(warnings_list, "差值不满足正态性,自动切换为 Wilcoxon 符号秩检验")
}
if (length(guardrail_status$warnings) > 0) {
warnings_list <- c(warnings_list, guardrail_status$warnings)
}
# ===== 核心计算 =====
if (use_wilcoxon) {
log_add("执行 Wilcoxon 符号秩检验")
result <- wilcox.test(before_vals, after_vals, paired = TRUE, exact = FALSE)
method_used <- "Wilcoxon signed rank test"
# Wilcoxon 效应量 r
z_value <- qnorm(result$p.value / 2) * sign(median(diff_vals))
effect_r <- abs(z_value) / sqrt(n)
effect_interpretation <- if (abs(effect_r) < 0.1) "微小" else if (abs(effect_r) < 0.3) "小" else if (abs(effect_r) < 0.5) "中等" else "大"
output_results <- list(
method = method_used,
statistic = jsonlite::unbox(as.numeric(result$statistic)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
effect_size = list(
r = jsonlite::unbox(round(effect_r, 4)),
interpretation = effect_interpretation
)
)
} else {
log_add("执行配对 T 检验")
result <- t.test(before_vals, after_vals, paired = TRUE)
method_used <- "Paired t-test"
# Cohen's d for paired samples
mean_diff <- mean(diff_vals)
sd_diff <- sd(diff_vals)
cohens_d <- mean_diff / sd_diff
effect_interpretation <- if (abs(cohens_d) < 0.2) "微小" else if (abs(cohens_d) < 0.5) "小" else if (abs(cohens_d) < 0.8) "中等" else "大"
log_add(glue("t = {round(result$statistic, 3)}, df = {round(result$parameter, 1)}, p = {round(result$p.value, 4)}, Cohen's d = {round(cohens_d, 3)}"))
output_results <- list(
method = method_used,
statistic = jsonlite::unbox(as.numeric(result$statistic)),
df = jsonlite::unbox(as.numeric(result$parameter)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
conf_int = as.numeric(result$conf.int),
effect_size = list(
cohens_d = jsonlite::unbox(round(cohens_d, 4)),
interpretation = effect_interpretation
)
)
}
# 添加描述性统计
output_results$descriptive <- list(
before = list(
variable = before_var,
n = n,
mean = round(mean(before_vals), 3),
sd = round(sd(before_vals), 3),
median = round(median(before_vals), 3)
),
after = list(
variable = after_var,
n = n,
mean = round(mean(after_vals), 3),
sd = round(sd(after_vals), 3),
median = round(median(after_vals), 3)
),
difference = list(
mean = round(mean(diff_vals), 3),
sd = round(sd(diff_vals), 3),
median = round(median(diff_vals), 3)
)
)
# ===== 生成图表 =====
log_add("生成配对比较图")
plot_base64 <- tryCatch({
generate_paired_plot(df, before_var, after_var, diff_vals)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 配对 T 检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
before_var <- "{before_var}"
after_var <- "{after_var}"
# 数据清洗
df <- df[!is.na(df[[before_var]]) & !is.na(df[[after_var]]), ]
# 配对 T 检验
before_vals <- df[[before_var]]
after_vals <- df[[after_var]]
result <- t.test(before_vals, after_vals, paired = TRUE)
print(result)
# Cohen d (效应量)
diff_vals <- after_vals - before_vals
cohens_d <- mean(diff_vals) / sd(diff_vals)
cat("Cohen d =", round(cohens_d, 3), "\\n")
# 可视化
df_long <- data.frame(
id = rep(1:nrow(df), 2),
time = rep(c("Before", "After"), each = nrow(df)),
value = c(before_vals, after_vals)
)
ggplot(df_long, aes(x = time, y = value, group = id)) +
geom_line(alpha = 0.3) +
geom_point() +
theme_minimal() +
labs(title = "Paired Comparison")
')
# ===== 构建 report_blocks =====
d <- output_results$descriptive
blocks <- list()
# Block 1: 样本概况
blocks[[length(blocks) + 1]] <- make_kv_block(
title = "样本概况",
items = list(
list(key = paste0(before_var, " (n)"), value = as.character(d$before$n)),
list(key = paste0(before_var, " Mean"), value = as.character(d$before$mean)),
list(key = paste0(before_var, " SD"), value = as.character(d$before$sd)),
list(key = paste0(before_var, " Median"), value = as.character(d$before$median)),
list(key = paste0(after_var, " (n)"), value = as.character(d$after$n)),
list(key = paste0(after_var, " Mean"), value = as.character(d$after$mean)),
list(key = paste0(after_var, " SD"), value = as.character(d$after$sd)),
list(key = paste0(after_var, " Median"), value = as.character(d$after$median)),
list(key = "差值 Mean", value = as.character(d$difference$mean)),
list(key = "差值 SD", value = as.character(d$difference$sd))
)
)
# Block 2: 检验结果表格(根据 use_wilcoxon 区分)
if (use_wilcoxon) {
blocks[[length(blocks) + 1]] <- make_table_block(
title = "Wilcoxon 符号秩检验结果",
headers = c("统计量 V", "P 值", "效应量 r", "效应量解释"),
rows = list(list(
round(as.numeric(output_results$statistic), 4),
format_p_value(output_results$p_value),
round(output_results$effect_size$r, 4),
output_results$effect_size$interpretation
)),
footnote = method_used
)
} else {
ci_str <- if (length(output_results$conf_int) >= 2) {
sprintf("[%.4f, %.4f]", output_results$conf_int[1], output_results$conf_int[2])
} else {
"—"
}
blocks[[length(blocks) + 1]] <- make_table_block(
title = "配对 T 检验结果",
headers = c("t", "df", "P 值", "95% CI", "Cohen's d", "效应量解释"),
rows = list(list(
round(as.numeric(output_results$statistic), 4),
round(as.numeric(output_results$df), 2),
format_p_value(output_results$p_value),
ci_str,
round(output_results$effect_size$cohens_d, 4),
output_results$effect_size$interpretation
)),
footnote = method_used
)
}
# Block 3: 配对比较图
if (!is.null(plot_base64)) {
blocks[[length(blocks) + 1]] <- make_image_block(
base64_data = plot_base64,
title = paste0("配对比较: ", before_var, " vs ", after_var),
alt = paste("配对比较图:", before_var, "与", after_var)
)
}
# Block 4: 结论摘要
sig <- if (output_results$p_value < 0.05) "存在统计学显著差异" else "差异无统计学意义"
if (use_wilcoxon) {
concl <- paste0(
"配对样本 **", before_var, "** 与 **", after_var, "** 的比较(", method_used, "",
"V = ", round(as.numeric(output_results$statistic), 3),
"P ", format_p_value(output_results$p_value),
",效应量 r = ", round(output_results$effect_size$r, 3),
"", output_results$effect_size$interpretation, ")。",
sig, "。"
)
} else {
concl <- paste0(
"配对样本 **", before_var, "** 与 **", after_var, "** 的比较(", method_used, "",
"t = ", round(as.numeric(output_results$statistic), 3),
"df = ", round(as.numeric(output_results$df), 1),
"P ", format_p_value(output_results$p_value),
"Cohen's d = ", round(output_results$effect_size$cohens_d, 3),
"", output_results$effect_size$interpretation, ")。",
sig, "。"
)
}
blocks[[length(blocks) + 1]] <- make_markdown_block(
title = "结果摘要",
content = concl
)
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = output_results,
report_blocks = blocks,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:生成配对比较图
generate_paired_plot <- function(df, before_var, after_var, diff_vals) {
n <- nrow(df)
# 创建长格式数据
df_long <- data.frame(
id = rep(1:n, 2),
time = factor(rep(c("Before", "After"), each = n), levels = c("Before", "After")),
value = c(df[[before_var]], df[[after_var]])
)
p <- ggplot(df_long, aes(x = time, y = value)) +
geom_line(aes(group = id), alpha = 0.3, color = "gray60") +
geom_point(aes(group = id), alpha = 0.5, size = 2) +
stat_summary(fun = mean, geom = "point", size = 4, color = "#ef4444", shape = 18) +
stat_summary(fun = mean, geom = "line", aes(group = 1), color = "#ef4444", size = 1.2) +
theme_minimal() +
labs(
title = paste("Paired Comparison:", before_var, "vs", after_var),
subtitle = paste("n =", n, ", Mean change =", round(mean(diff_vals), 2)),
x = "Time Point",
y = "Value"
)
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 6, height = 5, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}