Files
HaHafeng 371e1c069c feat(ssa): Complete QPER architecture - Query, Planner, Execute, Reflection layers
Implement the full QPER intelligent analysis pipeline:

- Phase E+: Block-based standardization for all 7 R tools, DynamicReport renderer, Word export enhancement

- Phase Q: LLM intent parsing with dynamic Zod validation against real column names, ClarificationCard component, DataProfile is_id_like tagging

- Phase P: ConfigLoader with Zod schema validation and hot-reload API, DecisionTableService (4-dimension matching), FlowTemplateService with EPV protection, PlannedTrace audit output

- Phase R: ReflectionService with statistical slot injection, sensitivity analysis conflict rules, ConclusionReport with section reveal animation, conclusion caching API, graceful R error classification

End-to-end test: 40/40 passed across two complete analysis scenarios.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 18:15:53 +08:00

286 lines
9.1 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_T_TEST_IND
#' @name 独立样本 T 检验
#' @version 1.0.0
#' @description 比较两组独立样本的均值差异
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
# 统一入口函数
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
tmp_files <- c()
# 确保退出时清理临时文件
on.exit({
if (length(tmp_files) > 0) {
unlink(tmp_files)
}
}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
guardrails_cfg <- input$guardrails
group_var <- p$group_var
value_var <- p$value_var
# ===== 参数校验 =====
if (!(group_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
}
if (!(value_var %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = value_var))
}
# ===== 数据清洗:移除分组变量或数值变量中的缺失值 =====
original_rows <- nrow(df)
# 处理分组变量:移除 NA、空字符串、纯空白字符
df <- df[!is.na(df[[group_var]]) & trimws(as.character(df[[group_var]])) != "", ]
# 处理数值变量:移除 NA
df <- df[!is.na(df[[value_var]]), ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
if (nrow(df) < 6) {
return(make_error(ERROR_CODES$E004_SAMPLE_TOO_SMALL,
n = nrow(df), min_required = 6))
}
groups <- unique(df[[group_var]])
if (length(groups) != 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = group_var, expected = 2, actual = length(groups)))
}
# ===== 护栏检查 =====
guardrail_results <- list()
method_used <- "t.test"
warnings_list <- c()
# 样本量检查
g1_vals <- df[df[[group_var]] == groups[1], value_var]
g2_vals <- df[df[[group_var]] == groups[2], value_var]
sample_check <- check_sample_size(min(length(g1_vals), length(g2_vals)),
min_required = 3,
action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: {sample_check$reason}"))
# 正态性检验
if (isTRUE(guardrails_cfg$check_normality)) {
log_add("执行正态性检验")
for (g in groups) {
vals <- df[df[[group_var]] == g, value_var]
norm_check <- check_normality(vals,
alpha = 0.05,
action = ACTION_SWITCH,
action_target = "ST_MANN_WHITNEY")
guardrail_results <- c(guardrail_results, list(norm_check))
log_add(glue("组[{g}] 正态性检验: p = {round(norm_check$p_value, 4)}, {norm_check$reason}"))
}
}
# 执行护栏链
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
if (guardrail_status$status == "switch") {
log_add(glue("触发方法切换: {guardrail_status$reason} -> {guardrail_status$target_tool}"))
# TODO: 调用备选方法
# 目前先继续执行 T 检验,但添加警告
warnings_list <- c(warnings_list, guardrail_status$reason)
}
if (length(guardrail_status$warnings) > 0) {
warnings_list <- c(warnings_list, guardrail_status$warnings)
}
# ===== 核心计算 =====
log_add("执行 T 检验")
result <- t.test(g1_vals, g2_vals, var.equal = FALSE)
# ===== 生成图表 =====
log_add("生成箱线图")
plot_base64 <- tryCatch({
generate_boxplot(df, group_var, value_var, tmp_files)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
# 获取原始文件名(如果未提供则使用默认值)
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 独立样本 T 检验
# 时间: {Sys.time()}
# ================================
# 自动安装依赖
required_packages <- c("ggplot2")
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages, repos = "https://cloud.r-project.org")
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
group_var <- "{group_var}"
value_var <- "{value_var}"
# 独立样本 T 检验 (Welch)
g1_vals <- df[df[[group_var]] == "{groups[1]}", value_var]
g2_vals <- df[df[[group_var]] == "{groups[2]}", value_var]
result <- t.test(g1_vals, g2_vals, var.equal = FALSE)
print(result)
# 可视化
ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#3b82f6", alpha = 0.6) +
theme_minimal() +
labs(title = paste("Distribution of", value_var, "by", group_var))
')
# ===== 构建 report_blocks =====
log_add("构建 report_blocks")
blocks <- list()
# Block 1: 描述统计键值对
g1_label <- as.character(groups[1])
g2_label <- as.character(groups[2])
blocks[[length(blocks) + 1]] <- make_kv_block(
title = "样本概况",
items = list(
list(key = paste0(group_var, " = ", g1_label, " (n)"), value = as.character(length(g1_vals))),
list(key = paste0(group_var, " = ", g2_label, " (n)"), value = as.character(length(g2_vals))),
list(key = paste0(g1_label, " Mean ± SD"),
value = paste0(round(mean(g1_vals), 3), " \u00b1 ", round(sd(g1_vals), 3))),
list(key = paste0(g2_label, " Mean ± SD"),
value = paste0(round(mean(g2_vals), 3), " \u00b1 ", round(sd(g2_vals), 3)))
)
)
# Block 2: 检验结果表格
blocks[[length(blocks) + 1]] <- make_table_block(
title = "独立样本 T 检验结果",
headers = c("统计量", "自由度", "P 值", "95% CI 下限", "95% CI 上限", "均值差"),
rows = list(
list(
round(as.numeric(result$statistic), 4),
round(as.numeric(result$parameter), 2),
format_p_value(result$p.value),
round(result$conf.int[1], 4),
round(result$conf.int[2], 4),
round(diff(result$estimate), 4)
)
),
footnote = result$method
)
# Block 3: 箱线图
if (!is.null(plot_base64)) {
blocks[[length(blocks) + 1]] <- make_image_block(
base64_data = plot_base64,
title = paste0(value_var, " by ", group_var),
alt = paste("箱线图:", value_var, "按", group_var, "分组")
)
}
# Block 4: 结论摘要
sig <- if (result$p.value < 0.05) "存在统计学显著差异" else "差异无统计学意义"
blocks[[length(blocks) + 1]] <- make_markdown_block(
title = "结果摘要",
content = paste0(
"两组 **", value_var, "** 的比较(", result$method, "",
"t = ", round(as.numeric(result$statistic), 3),
"df = ", round(as.numeric(result$parameter), 1),
"P ", format_p_value(result$p.value),
"。两组间", sig, "。"
)
)
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = list(
method = result$method,
statistic = jsonlite::unbox(as.numeric(result$statistic)),
df = jsonlite::unbox(as.numeric(result$parameter)),
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value),
conf_int = as.numeric(result$conf.int),
estimate = as.numeric(result$estimate),
group_stats = list(
list(group = g1_label, n = length(g1_vals), mean = mean(g1_vals), sd = sd(g1_vals)),
list(group = g2_label, n = length(g2_vals), mean = mean(g2_vals), sd = sd(g2_vals))
)
),
report_blocks = blocks,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:生成箱线图
generate_boxplot <- function(df, group_var, value_var, tmp_files_ref) {
p <- ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
geom_boxplot(fill = "#3b82f6", alpha = 0.6) +
theme_minimal() +
labs(title = paste("Distribution of", value_var, "by", group_var))
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}