feat(ssa): Complete T-test end-to-end testing with 9 bug fixes - Phase 1 core 85% complete. R service: missing value auto-filter. Backend: error handling, variable matching, dynamic filename. Frontend: module activation, session isolation, error propagation. Full flow verified.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
128
r-statistics-service/utils/data_loader.R
Normal file
128
r-statistics-service/utils/data_loader.R
Normal file
@@ -0,0 +1,128 @@
|
||||
# utils/data_loader.R
|
||||
# 混合数据协议:自动识别 inline 数据 vs 预签名 URL
|
||||
#
|
||||
# 架构说明:
|
||||
# - R 服务不持有 OSS 密钥,遵循平台 OSS 存储规范
|
||||
# - Node.js 后端通过 storage.getUrl() 生成预签名 URL
|
||||
# - R 服务直接访问预签名 URL 下载数据
|
||||
# - 开发环境使用 ai-clinical-data-dev bucket,无需 Mock
|
||||
|
||||
library(httr)
|
||||
library(jsonlite)
|
||||
library(glue)
|
||||
|
||||
# 统一数据加载入口
|
||||
load_input_data <- function(input) {
|
||||
# 检查输入结构
|
||||
if (is.null(input$data_source)) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "请求缺少 data_source 字段"))
|
||||
}
|
||||
|
||||
source_type <- input$data_source$type # "inline" | "oss"
|
||||
|
||||
if (source_type == "inline") {
|
||||
# 方式1:内联 JSON 数据(< 2MB)
|
||||
message("[DataLoader] 使用 inline 数据模式")
|
||||
|
||||
raw_data <- input$data_source$data
|
||||
|
||||
# 调试:打印原始数据结构
|
||||
message(glue("[DataLoader] 原始数据类型: {class(raw_data)}"))
|
||||
message(glue("[DataLoader] 原始数据字段: {paste(names(raw_data), collapse=', ')}"))
|
||||
|
||||
# 安全转换:处理不同的 JSON 解析结果
|
||||
if (is.data.frame(raw_data)) {
|
||||
df <- raw_data
|
||||
} else if (is.list(raw_data)) {
|
||||
# JSON 对象 {"col1": [...], "col2": [...]} -> data.frame
|
||||
# JSON 数组可能被解析为 list 而非 vector,需要先 unlist
|
||||
df <- data.frame(
|
||||
lapply(raw_data, function(x) {
|
||||
if (is.list(x)) unlist(x) else x
|
||||
}),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
} else {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("无法解析的数据类型:", class(raw_data))))
|
||||
}
|
||||
|
||||
message(glue("[DataLoader] 转换后: {nrow(df)} 行, {ncol(df)} 列, 列名: {paste(names(df), collapse=', ')}"))
|
||||
return(df)
|
||||
|
||||
} else if (source_type == "oss") {
|
||||
# 方式2:从预签名 URL 下载(2MB - 20MB)
|
||||
# 注意:oss_url 是由 Node.js 后端生成的预签名 URL,不是 oss_key
|
||||
oss_url <- input$data_source$oss_url
|
||||
|
||||
if (is.null(oss_url) || oss_url == "") {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "OSS 模式缺少 oss_url 字段"))
|
||||
}
|
||||
|
||||
return(load_from_signed_url(oss_url))
|
||||
|
||||
} else {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("未知的 data_source.type:", source_type)))
|
||||
}
|
||||
}
|
||||
|
||||
# 从预签名 URL 下载数据
|
||||
#
|
||||
# @param url 预签名 URL(由 Node.js storage.getUrl() 生成)
|
||||
# @return data.frame
|
||||
#
|
||||
# 说明:开发环境和生产环境都使用真实 OSS
|
||||
# - 开发环境:ai-clinical-data-dev bucket
|
||||
# - 生产环境:ai-clinical-data bucket
|
||||
load_from_signed_url <- function(url) {
|
||||
|
||||
message(glue("[DataLoader] 从预签名 URL 下载数据"))
|
||||
|
||||
temp_file <- tempfile(fileext = ".csv")
|
||||
on.exit(unlink(temp_file))
|
||||
|
||||
tryCatch({
|
||||
# 预签名 URL 自带认证信息,直接 GET 即可
|
||||
response <- GET(url, write_disk(temp_file, overwrite = TRUE))
|
||||
|
||||
status <- status_code(response)
|
||||
if (status != 200) {
|
||||
# 403 通常表示签名过期
|
||||
if (status == 403) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "预签名 URL 已过期,请重新上传数据"))
|
||||
}
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("OSS 下载失败,HTTP 状态码:", status)))
|
||||
}
|
||||
|
||||
# 检测文件类型并读取
|
||||
content_type <- headers(response)$`content-type`
|
||||
|
||||
if (grepl("csv", content_type, ignore.case = TRUE) ||
|
||||
grepl("\\.csv", url, ignore.case = TRUE)) {
|
||||
return(read.csv(temp_file, stringsAsFactors = FALSE))
|
||||
} else if (grepl("excel|xlsx", content_type, ignore.case = TRUE) ||
|
||||
grepl("\\.xlsx?", url, ignore.case = TRUE)) {
|
||||
# 需要 readxl 包
|
||||
if (!requireNamespace("readxl", quietly = TRUE)) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "Excel 文件需要 readxl 包"))
|
||||
}
|
||||
return(as.data.frame(readxl::read_excel(temp_file)))
|
||||
} else {
|
||||
# 默认尝试 CSV
|
||||
return(read.csv(temp_file, stringsAsFactors = FALSE))
|
||||
}
|
||||
|
||||
}, error = function(e) {
|
||||
if (grepl("make_error", deparse(e$call))) {
|
||||
stop(e) # 重新抛出已格式化的错误
|
||||
}
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("OSS 网络错误:", e$message)))
|
||||
})
|
||||
}
|
||||
99
r-statistics-service/utils/error_codes.R
Normal file
99
r-statistics-service/utils/error_codes.R
Normal file
@@ -0,0 +1,99 @@
|
||||
# utils/error_codes.R
|
||||
# 结构化错误码,便于 LLM 自愈
|
||||
|
||||
ERROR_CODES <- list(
|
||||
# 业务错误(可被 Planner 修复)
|
||||
E001_COLUMN_NOT_FOUND = list(
|
||||
code = "E001",
|
||||
type = "business",
|
||||
message_template = "列名 '{col}' 在数据中不存在",
|
||||
user_hint = "请检查变量名是否拼写正确"
|
||||
),
|
||||
E002_TYPE_MISMATCH = list(
|
||||
code = "E002",
|
||||
type = "business",
|
||||
message_template = "列 '{col}' 类型应为 {expected},实际为 {actual}",
|
||||
user_hint = "该列包含非数值数据,请检查数据格式"
|
||||
),
|
||||
E003_INSUFFICIENT_GROUPS = list(
|
||||
code = "E003",
|
||||
type = "business",
|
||||
message_template = "分组变量 '{col}' 应有 {expected} 个水平,实际有 {actual} 个",
|
||||
user_hint = "分组变量的取值个数不符合要求"
|
||||
),
|
||||
E004_SAMPLE_TOO_SMALL = list(
|
||||
code = "E004",
|
||||
type = "business",
|
||||
message_template = "样本量 {n} 不足,至少需要 {min_required}",
|
||||
user_hint = "数据量太少,无法进行统计分析"
|
||||
),
|
||||
|
||||
# 统计计算错误(用户友好映射)
|
||||
E005_SINGULAR_MATRIX = list(
|
||||
code = "E005",
|
||||
type = "business",
|
||||
message_template = "矩阵计算异常: {details}",
|
||||
user_hint = "变量之间可能存在多重共线性,建议移除高度相关的变量"
|
||||
),
|
||||
E006_CONVERGENCE_FAILED = list(
|
||||
code = "E006",
|
||||
type = "business",
|
||||
message_template = "模型未能收敛: {details}",
|
||||
user_hint = "算法未能找到稳定解,可能需要调整参数或检查数据"
|
||||
),
|
||||
E007_VARIANCE_ZERO = list(
|
||||
code = "E007",
|
||||
type = "business",
|
||||
message_template = "变量 '{col}' 方差为零",
|
||||
user_hint = "该列的所有值都相同,无法进行比较"
|
||||
),
|
||||
|
||||
# 系统错误(需人工介入)
|
||||
E100_INTERNAL_ERROR = list(
|
||||
code = "E100",
|
||||
type = "system",
|
||||
message_template = "内部错误: {details}",
|
||||
user_hint = "系统繁忙,请稍后重试"
|
||||
),
|
||||
E101_PACKAGE_MISSING = list(
|
||||
code = "E101",
|
||||
type = "system",
|
||||
message_template = "缺少依赖包: {package}",
|
||||
user_hint = "请联系管理员"
|
||||
)
|
||||
)
|
||||
|
||||
# R 原始错误到错误码的映射字典
|
||||
R_ERROR_MAPPING <- list(
|
||||
"system is computationally singular" = "E005_SINGULAR_MATRIX",
|
||||
"did not converge" = "E006_CONVERGENCE_FAILED",
|
||||
"constant" = "E007_VARIANCE_ZERO"
|
||||
)
|
||||
|
||||
# 构造错误响应(含用户友好提示)
|
||||
make_error <- function(error_def, ...) {
|
||||
params <- list(...)
|
||||
msg <- error_def$message_template
|
||||
for (name in names(params)) {
|
||||
msg <- gsub(paste0("\\{", name, "\\}"), as.character(params[[name]]), msg)
|
||||
}
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = error_def$code,
|
||||
error_type = error_def$type,
|
||||
message = msg,
|
||||
user_hint = error_def$user_hint
|
||||
))
|
||||
}
|
||||
|
||||
# 尝试将 R 原始错误映射为友好错误码
|
||||
map_r_error <- function(raw_error_msg) {
|
||||
for (pattern in names(R_ERROR_MAPPING)) {
|
||||
if (grepl(pattern, raw_error_msg, ignore.case = TRUE)) {
|
||||
error_key <- R_ERROR_MAPPING[[pattern]]
|
||||
return(make_error(ERROR_CODES[[error_key]], details = raw_error_msg))
|
||||
}
|
||||
}
|
||||
# 无法映射,返回通用内部错误
|
||||
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = raw_error_msg))
|
||||
}
|
||||
116
r-statistics-service/utils/guardrails.R
Normal file
116
r-statistics-service/utils/guardrails.R
Normal file
@@ -0,0 +1,116 @@
|
||||
# utils/guardrails.R
|
||||
# 统计护栏函数库
|
||||
|
||||
library(glue)
|
||||
|
||||
# 大样本优化阈值
|
||||
LARGE_SAMPLE_THRESHOLD <- 5000
|
||||
|
||||
# 护栏 Action 类型
|
||||
ACTION_BLOCK <- "Block" # 阻止执行
|
||||
ACTION_WARN <- "Warn" # 警告但继续
|
||||
ACTION_SWITCH <- "Switch" # 切换到备选方法
|
||||
|
||||
# 正态性检验(支持三种 Action)
|
||||
check_normality <- function(values, alpha = 0.05, action = ACTION_SWITCH, action_target = NULL) {
|
||||
n <- length(values)
|
||||
|
||||
# 样本量过小
|
||||
if (n < 3) {
|
||||
return(list(
|
||||
passed = TRUE,
|
||||
action = NULL,
|
||||
action_target = NULL,
|
||||
reason = "样本量过小,跳过正态性检验",
|
||||
skipped = TRUE
|
||||
))
|
||||
}
|
||||
|
||||
# 大样本优化:N > 5000 时使用抽样检验
|
||||
if (n > LARGE_SAMPLE_THRESHOLD) {
|
||||
set.seed(42)
|
||||
sampled_values <- sample(values, 1000)
|
||||
test <- shapiro.test(sampled_values)
|
||||
passed <- test$p.value >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
action_target = if (passed) NULL else action_target,
|
||||
p_value = test$p.value,
|
||||
reason = glue("大样本(N={n})抽样检验,{if (passed) '满足正态性' else '不满足正态性'}"),
|
||||
sampled = TRUE,
|
||||
sample_size = 1000
|
||||
))
|
||||
}
|
||||
|
||||
# 常规检验
|
||||
test <- shapiro.test(values)
|
||||
passed <- test$p.value >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
action_target = if (passed) NULL else action_target,
|
||||
p_value = test$p.value,
|
||||
reason = if (passed) "满足正态性" else "不满足正态性",
|
||||
sampled = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
# 方差齐性检验 (Levene)
|
||||
check_homogeneity <- function(df, group_var, value_var, alpha = 0.05, action = ACTION_WARN) {
|
||||
library(car)
|
||||
|
||||
formula <- as.formula(paste(value_var, "~", group_var))
|
||||
test <- leveneTest(formula, data = df)
|
||||
p_val <- test$`Pr(>F)`[1]
|
||||
passed <- p_val >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
p_value = p_val,
|
||||
reason = if (passed) "方差齐性满足" else "方差不齐性"
|
||||
))
|
||||
}
|
||||
|
||||
# 样本量检验
|
||||
check_sample_size <- function(n, min_required = 3, action = ACTION_BLOCK) {
|
||||
passed <- n >= min_required
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
n = n,
|
||||
reason = if (passed) "样本量充足" else paste0("样本量不足, 需要至少 ", min_required)
|
||||
))
|
||||
}
|
||||
|
||||
# 执行护栏链(按 check_order 顺序执行)
|
||||
run_guardrail_chain <- function(guardrail_results) {
|
||||
warnings <- c()
|
||||
|
||||
for (result in guardrail_results) {
|
||||
if (!result$passed) {
|
||||
if (result$action == ACTION_BLOCK) {
|
||||
return(list(
|
||||
status = "blocked",
|
||||
reason = result$reason
|
||||
))
|
||||
} else if (result$action == ACTION_SWITCH) {
|
||||
return(list(
|
||||
status = "switch",
|
||||
target_tool = result$action_target,
|
||||
reason = result$reason
|
||||
))
|
||||
} else if (result$action == ACTION_WARN) {
|
||||
warnings <- c(warnings, result$reason)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return(list(
|
||||
status = "passed",
|
||||
warnings = warnings
|
||||
))
|
||||
}
|
||||
44
r-statistics-service/utils/result_formatter.R
Normal file
44
r-statistics-service/utils/result_formatter.R
Normal file
@@ -0,0 +1,44 @@
|
||||
# utils/result_formatter.R
|
||||
# 统计结果格式化,确保 p 值显示规范
|
||||
|
||||
# 格式化 p 值(符合 APA 规范)
|
||||
format_p_value <- function(p) {
|
||||
if (is.na(p)) return(NA)
|
||||
|
||||
if (p < 0.001) {
|
||||
return("< 0.001")
|
||||
} else {
|
||||
return(sprintf("%.3f", p))
|
||||
}
|
||||
}
|
||||
|
||||
# 构建标准化结果(包含 p_value_fmt)
|
||||
make_result <- function(p_value, statistic, method, ...) {
|
||||
list(
|
||||
p_value = p_value,
|
||||
p_value_fmt = format_p_value(p_value),
|
||||
statistic = statistic,
|
||||
method = method,
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
# 格式化置信区间
|
||||
format_ci <- function(lower, upper, digits = 2) {
|
||||
sprintf("[%.2f, %.2f]", lower, upper)
|
||||
}
|
||||
|
||||
# 格式化效应量
|
||||
format_effect_size <- function(value, type = "d") {
|
||||
interpretation <- ""
|
||||
if (type == "d") { # Cohen's d
|
||||
if (abs(value) < 0.2) interpretation <- "微小"
|
||||
else if (abs(value) < 0.5) interpretation <- "小"
|
||||
else if (abs(value) < 0.8) interpretation <- "中等"
|
||||
else interpretation <- "大"
|
||||
}
|
||||
list(
|
||||
value = round(value, 3),
|
||||
interpretation = interpretation
|
||||
)
|
||||
}
|
||||
Reference in New Issue
Block a user