feat(ssa): Complete T-test end-to-end testing with 9 bug fixes - Phase 1 core 85% complete. R service: missing value auto-filter. Backend: error handling, variable matching, dynamic filename. Frontend: module activation, session isolation, error propagation. Full flow verified.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2026-02-19 20:57:00 +08:00
parent 8137e3cde2
commit 49b5c37cb1
86 changed files with 21207 additions and 252 deletions

View File

@@ -0,0 +1,128 @@
# utils/data_loader.R
# 混合数据协议:自动识别 inline 数据 vs 预签名 URL
#
# 架构说明:
# - R 服务不持有 OSS 密钥,遵循平台 OSS 存储规范
# - Node.js 后端通过 storage.getUrl() 生成预签名 URL
# - R 服务直接访问预签名 URL 下载数据
# - 开发环境使用 ai-clinical-data-dev bucket无需 Mock
library(httr)
library(jsonlite)
library(glue)
# 统一数据加载入口
load_input_data <- function(input) {
# 检查输入结构
if (is.null(input$data_source)) {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = "请求缺少 data_source 字段"))
}
source_type <- input$data_source$type # "inline" | "oss"
if (source_type == "inline") {
# 方式1内联 JSON 数据(< 2MB
message("[DataLoader] 使用 inline 数据模式")
raw_data <- input$data_source$data
# 调试:打印原始数据结构
message(glue("[DataLoader] 原始数据类型: {class(raw_data)}"))
message(glue("[DataLoader] 原始数据字段: {paste(names(raw_data), collapse=', ')}"))
# 安全转换:处理不同的 JSON 解析结果
if (is.data.frame(raw_data)) {
df <- raw_data
} else if (is.list(raw_data)) {
# JSON 对象 {"col1": [...], "col2": [...]} -> data.frame
# JSON 数组可能被解析为 list 而非 vector需要先 unlist
df <- data.frame(
lapply(raw_data, function(x) {
if (is.list(x)) unlist(x) else x
}),
stringsAsFactors = FALSE
)
} else {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = paste("无法解析的数据类型:", class(raw_data))))
}
message(glue("[DataLoader] 转换后: {nrow(df)} 行, {ncol(df)} 列, 列名: {paste(names(df), collapse=', ')}"))
return(df)
} else if (source_type == "oss") {
# 方式2从预签名 URL 下载2MB - 20MB
# 注意oss_url 是由 Node.js 后端生成的预签名 URL不是 oss_key
oss_url <- input$data_source$oss_url
if (is.null(oss_url) || oss_url == "") {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = "OSS 模式缺少 oss_url 字段"))
}
return(load_from_signed_url(oss_url))
} else {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = paste("未知的 data_source.type:", source_type)))
}
}
# 从预签名 URL 下载数据
#
# @param url 预签名 URL由 Node.js storage.getUrl() 生成)
# @return data.frame
#
# 说明:开发环境和生产环境都使用真实 OSS
# - 开发环境ai-clinical-data-dev bucket
# - 生产环境ai-clinical-data bucket
load_from_signed_url <- function(url) {
message(glue("[DataLoader] 从预签名 URL 下载数据"))
temp_file <- tempfile(fileext = ".csv")
on.exit(unlink(temp_file))
tryCatch({
# 预签名 URL 自带认证信息,直接 GET 即可
response <- GET(url, write_disk(temp_file, overwrite = TRUE))
status <- status_code(response)
if (status != 200) {
# 403 通常表示签名过期
if (status == 403) {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = "预签名 URL 已过期,请重新上传数据"))
}
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = paste("OSS 下载失败HTTP 状态码:", status)))
}
# 检测文件类型并读取
content_type <- headers(response)$`content-type`
if (grepl("csv", content_type, ignore.case = TRUE) ||
grepl("\\.csv", url, ignore.case = TRUE)) {
return(read.csv(temp_file, stringsAsFactors = FALSE))
} else if (grepl("excel|xlsx", content_type, ignore.case = TRUE) ||
grepl("\\.xlsx?", url, ignore.case = TRUE)) {
# 需要 readxl 包
if (!requireNamespace("readxl", quietly = TRUE)) {
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = "Excel 文件需要 readxl 包"))
}
return(as.data.frame(readxl::read_excel(temp_file)))
} else {
# 默认尝试 CSV
return(read.csv(temp_file, stringsAsFactors = FALSE))
}
}, error = function(e) {
if (grepl("make_error", deparse(e$call))) {
stop(e) # 重新抛出已格式化的错误
}
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
details = paste("OSS 网络错误:", e$message)))
})
}

View File

@@ -0,0 +1,99 @@
# utils/error_codes.R
# 结构化错误码,便于 LLM 自愈
ERROR_CODES <- list(
# 业务错误(可被 Planner 修复)
E001_COLUMN_NOT_FOUND = list(
code = "E001",
type = "business",
message_template = "列名 '{col}' 在数据中不存在",
user_hint = "请检查变量名是否拼写正确"
),
E002_TYPE_MISMATCH = list(
code = "E002",
type = "business",
message_template = "列 '{col}' 类型应为 {expected},实际为 {actual}",
user_hint = "该列包含非数值数据,请检查数据格式"
),
E003_INSUFFICIENT_GROUPS = list(
code = "E003",
type = "business",
message_template = "分组变量 '{col}' 应有 {expected} 个水平,实际有 {actual} 个",
user_hint = "分组变量的取值个数不符合要求"
),
E004_SAMPLE_TOO_SMALL = list(
code = "E004",
type = "business",
message_template = "样本量 {n} 不足,至少需要 {min_required}",
user_hint = "数据量太少,无法进行统计分析"
),
# 统计计算错误(用户友好映射)
E005_SINGULAR_MATRIX = list(
code = "E005",
type = "business",
message_template = "矩阵计算异常: {details}",
user_hint = "变量之间可能存在多重共线性,建议移除高度相关的变量"
),
E006_CONVERGENCE_FAILED = list(
code = "E006",
type = "business",
message_template = "模型未能收敛: {details}",
user_hint = "算法未能找到稳定解,可能需要调整参数或检查数据"
),
E007_VARIANCE_ZERO = list(
code = "E007",
type = "business",
message_template = "变量 '{col}' 方差为零",
user_hint = "该列的所有值都相同,无法进行比较"
),
# 系统错误(需人工介入)
E100_INTERNAL_ERROR = list(
code = "E100",
type = "system",
message_template = "内部错误: {details}",
user_hint = "系统繁忙,请稍后重试"
),
E101_PACKAGE_MISSING = list(
code = "E101",
type = "system",
message_template = "缺少依赖包: {package}",
user_hint = "请联系管理员"
)
)
# R 原始错误到错误码的映射字典
R_ERROR_MAPPING <- list(
"system is computationally singular" = "E005_SINGULAR_MATRIX",
"did not converge" = "E006_CONVERGENCE_FAILED",
"constant" = "E007_VARIANCE_ZERO"
)
# 构造错误响应(含用户友好提示)
make_error <- function(error_def, ...) {
params <- list(...)
msg <- error_def$message_template
for (name in names(params)) {
msg <- gsub(paste0("\\{", name, "\\}"), as.character(params[[name]]), msg)
}
return(list(
status = "error",
error_code = error_def$code,
error_type = error_def$type,
message = msg,
user_hint = error_def$user_hint
))
}
# 尝试将 R 原始错误映射为友好错误码
map_r_error <- function(raw_error_msg) {
for (pattern in names(R_ERROR_MAPPING)) {
if (grepl(pattern, raw_error_msg, ignore.case = TRUE)) {
error_key <- R_ERROR_MAPPING[[pattern]]
return(make_error(ERROR_CODES[[error_key]], details = raw_error_msg))
}
}
# 无法映射,返回通用内部错误
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = raw_error_msg))
}

View File

@@ -0,0 +1,116 @@
# utils/guardrails.R
# 统计护栏函数库
library(glue)
# 大样本优化阈值
LARGE_SAMPLE_THRESHOLD <- 5000
# 护栏 Action 类型
ACTION_BLOCK <- "Block" # 阻止执行
ACTION_WARN <- "Warn" # 警告但继续
ACTION_SWITCH <- "Switch" # 切换到备选方法
# 正态性检验(支持三种 Action
check_normality <- function(values, alpha = 0.05, action = ACTION_SWITCH, action_target = NULL) {
n <- length(values)
# 样本量过小
if (n < 3) {
return(list(
passed = TRUE,
action = NULL,
action_target = NULL,
reason = "样本量过小,跳过正态性检验",
skipped = TRUE
))
}
# 大样本优化N > 5000 时使用抽样检验
if (n > LARGE_SAMPLE_THRESHOLD) {
set.seed(42)
sampled_values <- sample(values, 1000)
test <- shapiro.test(sampled_values)
passed <- test$p.value >= alpha
return(list(
passed = passed,
action = if (passed) NULL else action,
action_target = if (passed) NULL else action_target,
p_value = test$p.value,
reason = glue("大样本(N={n})抽样检验,{if (passed) '满足正态性' else '不满足正态性'}"),
sampled = TRUE,
sample_size = 1000
))
}
# 常规检验
test <- shapiro.test(values)
passed <- test$p.value >= alpha
return(list(
passed = passed,
action = if (passed) NULL else action,
action_target = if (passed) NULL else action_target,
p_value = test$p.value,
reason = if (passed) "满足正态性" else "不满足正态性",
sampled = FALSE
))
}
# 方差齐性检验 (Levene)
check_homogeneity <- function(df, group_var, value_var, alpha = 0.05, action = ACTION_WARN) {
library(car)
formula <- as.formula(paste(value_var, "~", group_var))
test <- leveneTest(formula, data = df)
p_val <- test$`Pr(>F)`[1]
passed <- p_val >= alpha
return(list(
passed = passed,
action = if (passed) NULL else action,
p_value = p_val,
reason = if (passed) "方差齐性满足" else "方差不齐性"
))
}
# 样本量检验
check_sample_size <- function(n, min_required = 3, action = ACTION_BLOCK) {
passed <- n >= min_required
return(list(
passed = passed,
action = if (passed) NULL else action,
n = n,
reason = if (passed) "样本量充足" else paste0("样本量不足, 需要至少 ", min_required)
))
}
# 执行护栏链(按 check_order 顺序执行)
run_guardrail_chain <- function(guardrail_results) {
warnings <- c()
for (result in guardrail_results) {
if (!result$passed) {
if (result$action == ACTION_BLOCK) {
return(list(
status = "blocked",
reason = result$reason
))
} else if (result$action == ACTION_SWITCH) {
return(list(
status = "switch",
target_tool = result$action_target,
reason = result$reason
))
} else if (result$action == ACTION_WARN) {
warnings <- c(warnings, result$reason)
}
}
}
return(list(
status = "passed",
warnings = warnings
))
}

View File

@@ -0,0 +1,44 @@
# utils/result_formatter.R
# 统计结果格式化,确保 p 值显示规范
# 格式化 p 值(符合 APA 规范)
format_p_value <- function(p) {
if (is.na(p)) return(NA)
if (p < 0.001) {
return("< 0.001")
} else {
return(sprintf("%.3f", p))
}
}
# 构建标准化结果(包含 p_value_fmt
make_result <- function(p_value, statistic, method, ...) {
list(
p_value = p_value,
p_value_fmt = format_p_value(p_value),
statistic = statistic,
method = method,
...
)
}
# 格式化置信区间
format_ci <- function(lower, upper, digits = 2) {
sprintf("[%.2f, %.2f]", lower, upper)
}
# 格式化效应量
format_effect_size <- function(value, type = "d") {
interpretation <- ""
if (type == "d") { # Cohen's d
if (abs(value) < 0.2) interpretation <- "微小"
else if (abs(value) < 0.5) interpretation <- "小"
else if (abs(value) < 0.8) interpretation <- "中等"
else interpretation <- "大"
}
list(
value = round(value, 3),
interpretation = interpretation
)
}