feat(ssa): Complete T-test end-to-end testing with 9 bug fixes - Phase 1 core 85% complete. R service: missing value auto-filter. Backend: error handling, variable matching, dynamic filename. Frontend: module activation, session isolation, error propagation. Full flow verified.
Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
2
r-statistics-service/.Rprofile
Normal file
2
r-statistics-service/.Rprofile
Normal file
@@ -0,0 +1,2 @@
|
||||
# renv 初始化
|
||||
source("renv/activate.R")
|
||||
64
r-statistics-service/Dockerfile
Normal file
64
r-statistics-service/Dockerfile
Normal file
@@ -0,0 +1,64 @@
|
||||
FROM rocker/r-ver:4.3
|
||||
|
||||
LABEL maintainer="dev-team@aiclinicalresearch.com"
|
||||
LABEL version="1.0.1"
|
||||
LABEL description="SSA-Pro R Statistics Service"
|
||||
|
||||
# 安装系统依赖(包括 R 包编译所需的库)
|
||||
RUN apt-get update && apt-get install -y \
|
||||
libcurl4-openssl-dev \
|
||||
libssl-dev \
|
||||
libxml2-dev \
|
||||
libsodium-dev \
|
||||
zlib1g-dev \
|
||||
libnlopt-dev \
|
||||
liblapack-dev \
|
||||
libblas-dev \
|
||||
gfortran \
|
||||
pkg-config \
|
||||
cmake \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 直接安装 R 包(简化方案,避免 renv 版本冲突)
|
||||
RUN R -e "install.packages(c( \
|
||||
'plumber', \
|
||||
'jsonlite', \
|
||||
'ggplot2', \
|
||||
'glue', \
|
||||
'dplyr', \
|
||||
'tidyr', \
|
||||
'base64enc', \
|
||||
'yaml', \
|
||||
'car', \
|
||||
'httr' \
|
||||
), repos='https://cloud.r-project.org/', Ncpus=2)"
|
||||
|
||||
# ===== 安全加固:创建非特权用户 =====
|
||||
RUN useradd -m -s /bin/bash appuser
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 复制应用代码
|
||||
COPY plumber.R plumber.R
|
||||
COPY utils/ utils/
|
||||
COPY tools/ tools/
|
||||
COPY tests/ tests/
|
||||
|
||||
# 设置目录权限
|
||||
RUN chown -R appuser:appuser /app
|
||||
|
||||
# ===== 切换到非特权用户 =====
|
||||
USER appuser
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
# 环境变量
|
||||
ENV DEV_MODE="false"
|
||||
|
||||
# 健康检查
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8080/health || exit 1
|
||||
|
||||
# 启动服务(不清理 /tmp,避免权限问题)
|
||||
CMD ["R", "-e", "plumber::plumb('plumber.R')$run(host='0.0.0.0', port=8080)"]
|
||||
23
r-statistics-service/docker-compose.yml
Normal file
23
r-statistics-service/docker-compose.yml
Normal file
@@ -0,0 +1,23 @@
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
ssa-r-service:
|
||||
build: .
|
||||
container_name: ssa-r-statistics
|
||||
ports:
|
||||
- "8082:8080" # 主机8082 → 容器8080(REDCap占用8080/8081)
|
||||
environment:
|
||||
# 开发模式:启用热重载(每次请求重新加载工具脚本)
|
||||
- DEV_MODE=true
|
||||
volumes:
|
||||
# 开发环境挂载:支持热重载
|
||||
- ./tools:/app/tools
|
||||
- ./utils:/app/utils
|
||||
- ./tests:/app/tests
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
202
r-statistics-service/plumber.R
Normal file
202
r-statistics-service/plumber.R
Normal file
@@ -0,0 +1,202 @@
|
||||
# plumber.R
|
||||
# SSA-Pro R Statistics Service 入口文件
|
||||
#
|
||||
# 安全与性能优化:
|
||||
# - 生产环境预加载所有工具脚本
|
||||
# - tool_code 白名单正则校验(防止路径遍历攻击)
|
||||
|
||||
library(plumber)
|
||||
library(jsonlite)
|
||||
|
||||
# 环境配置
|
||||
DEV_MODE <- Sys.getenv("DEV_MODE", "false") == "true"
|
||||
|
||||
# 加载公共函数
|
||||
source("utils/error_codes.R")
|
||||
source("utils/data_loader.R")
|
||||
source("utils/guardrails.R")
|
||||
source("utils/result_formatter.R")
|
||||
|
||||
# 工具目录
|
||||
tools_dir <- "tools"
|
||||
tool_files <- list.files(tools_dir, pattern = "\\.R$", full.names = TRUE)
|
||||
|
||||
# ========== 生产环境预加载优化 ==========
|
||||
# 在服务启动时预加载所有工具脚本到独立环境
|
||||
# 避免每次请求都从磁盘读取和解析
|
||||
|
||||
# 工具缓存环境
|
||||
TOOL_CACHE <- new.env(parent = emptyenv())
|
||||
|
||||
# 预加载函数
|
||||
preload_tools <- function() {
|
||||
message("[Init] 预加载工具脚本...")
|
||||
|
||||
for (f in tool_files) {
|
||||
tool_name <- tools::file_path_sans_ext(basename(f))
|
||||
|
||||
# 创建独立环境加载工具
|
||||
tool_env <- new.env(parent = globalenv())
|
||||
source(f, local = tool_env)
|
||||
|
||||
# 检查是否实现了 run_analysis
|
||||
if (exists("run_analysis", envir = tool_env, mode = "function")) {
|
||||
TOOL_CACHE[[tool_name]] <- tool_env$run_analysis
|
||||
message(paste("[Init] 已加载:", tool_name))
|
||||
} else {
|
||||
warning(paste("[Init] 工具缺少 run_analysis 函数:", tool_name))
|
||||
}
|
||||
}
|
||||
|
||||
message(paste("[Init] 预加载完成,共", length(ls(TOOL_CACHE)), "个工具"))
|
||||
}
|
||||
|
||||
# 生产环境:启动时预加载
|
||||
# 开发环境:跳过(支持热重载)
|
||||
if (!DEV_MODE) {
|
||||
preload_tools()
|
||||
} else {
|
||||
message("[Init] DEV_MODE 启用,跳过预加载(支持热重载)")
|
||||
# 开发模式仍需首次加载
|
||||
for (f in tool_files) source(f)
|
||||
}
|
||||
|
||||
# ========== 安全校验函数 ==========
|
||||
|
||||
#' 校验 tool_code 格式(防止路径遍历攻击)
|
||||
#' @param tool_code 工具代码
|
||||
#' @return TRUE 如果格式合法,否则 FALSE
|
||||
validate_tool_code <- function(tool_code) {
|
||||
# 只允许:大写字母、数字、下划线
|
||||
# 有效示例:ST_T_TEST_IND, ST_ANOVA, T_TEST_IND
|
||||
# 无效示例:../etc/passwd, ST_TEST;rm -rf
|
||||
pattern <- "^[A-Z][A-Z0-9_]*$"
|
||||
return(grepl(pattern, tool_code))
|
||||
}
|
||||
|
||||
#' 将 tool_code 转换为工具名(小写,去除 ST_ 前缀)
|
||||
#' @param tool_code 例如 "ST_T_TEST_IND"
|
||||
#' @return 例如 "t_test_ind"
|
||||
normalize_tool_name <- function(tool_code) {
|
||||
name <- tolower(gsub("^ST_", "", tool_code))
|
||||
return(name)
|
||||
}
|
||||
|
||||
# ========== API 定义 ==========
|
||||
|
||||
#* @apiTitle SSA-Pro R Statistics Service
|
||||
#* @apiDescription 严谨型统计分析 R 引擎
|
||||
|
||||
#* 健康检查
|
||||
#* @get /health
|
||||
function() {
|
||||
list(
|
||||
status = "ok",
|
||||
timestamp = Sys.time(),
|
||||
version = "1.0.1",
|
||||
dev_mode = DEV_MODE,
|
||||
tools_loaded = if (DEV_MODE) length(tool_files) else length(ls(TOOL_CACHE))
|
||||
)
|
||||
}
|
||||
|
||||
#* 列出已加载的工具
|
||||
#* @get /api/v1/tools
|
||||
function() {
|
||||
if (DEV_MODE) {
|
||||
tools <- gsub("\\.R$", "", basename(tool_files))
|
||||
} else {
|
||||
tools <- ls(TOOL_CACHE)
|
||||
}
|
||||
|
||||
list(
|
||||
status = "ok",
|
||||
tools = tools,
|
||||
count = length(tools)
|
||||
)
|
||||
}
|
||||
|
||||
#* 执行统计工具
|
||||
#* @post /api/v1/skills/<tool_code>
|
||||
#* @param tool_code:str 工具代码(如 ST_T_TEST_IND)
|
||||
#* @serializer unboxedJSON
|
||||
function(req, tool_code) {
|
||||
tryCatch({
|
||||
|
||||
# ===== 安全校验:tool_code 白名单 =====
|
||||
if (!validate_tool_code(tool_code)) {
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = "E400",
|
||||
message = "Invalid tool code format",
|
||||
user_hint = "工具代码格式错误,只允许大写字母、数字和下划线"
|
||||
))
|
||||
}
|
||||
|
||||
# 解析请求体
|
||||
input <- jsonlite::fromJSON(req$postBody, simplifyVector = FALSE)
|
||||
|
||||
# Debug 模式:保留临时文件用于排查
|
||||
debug_mode <- isTRUE(input$debug)
|
||||
|
||||
# 统一入口函数名
|
||||
func_name <- "run_analysis"
|
||||
|
||||
# 标准化工具名
|
||||
tool_name <- normalize_tool_name(tool_code)
|
||||
tool_file <- file.path("tools", paste0(tool_name, ".R"))
|
||||
|
||||
# ===== 根据环境选择加载策略 =====
|
||||
if (DEV_MODE) {
|
||||
# 开发环境:每次请求重新加载(支持热重载)
|
||||
if (!file.exists(tool_file)) {
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = "E100",
|
||||
message = paste("Unknown tool:", tool_code),
|
||||
user_hint = "请检查工具代码是否正确"
|
||||
))
|
||||
}
|
||||
|
||||
source(tool_file)
|
||||
|
||||
if (!exists(func_name, mode = "function")) {
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = "E100",
|
||||
message = paste("Tool", tool_code, "does not implement run_analysis()"),
|
||||
user_hint = "工具脚本格式错误,请联系管理员"
|
||||
))
|
||||
}
|
||||
|
||||
# 执行分析
|
||||
result <- do.call(func_name, list(input))
|
||||
|
||||
} else {
|
||||
# 生产环境:从缓存加载
|
||||
if (!exists(tool_name, envir = TOOL_CACHE)) {
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = "E100",
|
||||
message = paste("Unknown tool:", tool_code),
|
||||
user_hint = "请检查工具代码是否正确,或联系管理员确认工具已部署"
|
||||
))
|
||||
}
|
||||
|
||||
# 从缓存获取函数并执行
|
||||
cached_func <- TOOL_CACHE[[tool_name]]
|
||||
result <- cached_func(input)
|
||||
}
|
||||
|
||||
# Debug 模式:附加临时文件路径
|
||||
if (debug_mode && !is.null(result$tmp_files)) {
|
||||
result$debug_files <- result$tmp_files
|
||||
message("[DEBUG] 临时文件已保留: ", paste(result$tmp_files, collapse = ", "))
|
||||
}
|
||||
|
||||
return(result)
|
||||
|
||||
}, error = function(e) {
|
||||
# 使用友好错误映射
|
||||
return(map_r_error(e$message))
|
||||
})
|
||||
}
|
||||
24
r-statistics-service/renv.lock
Normal file
24
r-statistics-service/renv.lock
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"R": {
|
||||
"Version": "4.3.0",
|
||||
"Repositories": [
|
||||
{
|
||||
"Name": "CRAN",
|
||||
"URL": "https://cloud.r-project.org"
|
||||
}
|
||||
]
|
||||
},
|
||||
"Packages": {
|
||||
"plumber": { "Package": "plumber", "Version": "1.2.1", "Source": "Repository" },
|
||||
"jsonlite": { "Package": "jsonlite", "Version": "1.8.8", "Source": "Repository" },
|
||||
"ggplot2": { "Package": "ggplot2", "Version": "3.4.4", "Source": "Repository" },
|
||||
"glue": { "Package": "glue", "Version": "1.7.0", "Source": "Repository" },
|
||||
"styler": { "Package": "styler", "Version": "1.10.2", "Source": "Repository" },
|
||||
"dplyr": { "Package": "dplyr", "Version": "1.1.4", "Source": "Repository" },
|
||||
"tidyr": { "Package": "tidyr", "Version": "1.3.0", "Source": "Repository" },
|
||||
"base64enc": { "Package": "base64enc", "Version": "0.1-3", "Source": "Repository" },
|
||||
"yaml": { "Package": "yaml", "Version": "2.3.8", "Source": "Repository" },
|
||||
"car": { "Package": "car", "Version": "3.1-2", "Source": "Repository" },
|
||||
"httr": { "Package": "httr", "Version": "1.4.7", "Source": "Repository" }
|
||||
}
|
||||
}
|
||||
21
r-statistics-service/tests/fixtures/normal_data.csv
vendored
Normal file
21
r-statistics-service/tests/fixtures/normal_data.csv
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
group,value
|
||||
A,10.5
|
||||
A,11.2
|
||||
A,9.8
|
||||
A,10.1
|
||||
A,11.5
|
||||
A,10.8
|
||||
A,9.5
|
||||
A,10.3
|
||||
A,11.0
|
||||
A,10.6
|
||||
B,12.3
|
||||
B,13.1
|
||||
B,11.8
|
||||
B,12.5
|
||||
B,13.0
|
||||
B,12.1
|
||||
B,11.5
|
||||
B,12.8
|
||||
B,13.2
|
||||
B,12.0
|
||||
|
17
r-statistics-service/tests/fixtures/sample_t_test.csv
vendored
Normal file
17
r-statistics-service/tests/fixtures/sample_t_test.csv
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
group,score
|
||||
A,23
|
||||
A,25
|
||||
A,27
|
||||
A,22
|
||||
A,24
|
||||
A,26
|
||||
A,21
|
||||
A,28
|
||||
B,30
|
||||
B,32
|
||||
B,28
|
||||
B,31
|
||||
B,29
|
||||
B,33
|
||||
B,27
|
||||
B,35
|
||||
|
16
r-statistics-service/tests/test_t_test.json
Normal file
16
r-statistics-service/tests/test_t_test.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"data_source": {
|
||||
"type": "inline",
|
||||
"data": {
|
||||
"group": ["A", "A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B"],
|
||||
"score": [23, 25, 27, 22, 24, 26, 21, 28, 30, 32, 28, 31, 29, 33, 27, 35]
|
||||
}
|
||||
},
|
||||
"params": {
|
||||
"group_var": "group",
|
||||
"value_var": "score"
|
||||
},
|
||||
"guardrails": {
|
||||
"check_normality": true
|
||||
}
|
||||
}
|
||||
217
r-statistics-service/tools/t_test_ind.R
Normal file
217
r-statistics-service/tools/t_test_ind.R
Normal file
@@ -0,0 +1,217 @@
|
||||
#' @tool_code ST_T_TEST_IND
|
||||
#' @name 独立样本 T 检验
|
||||
#' @version 1.0.0
|
||||
#' @description 比较两组独立样本的均值差异
|
||||
#' @author SSA-Pro Team
|
||||
|
||||
library(glue)
|
||||
library(ggplot2)
|
||||
library(base64enc)
|
||||
|
||||
# 统一入口函数
|
||||
run_analysis <- function(input) {
|
||||
# ===== 初始化 =====
|
||||
logs <- c()
|
||||
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
|
||||
tmp_files <- c()
|
||||
|
||||
# 确保退出时清理临时文件
|
||||
on.exit({
|
||||
if (length(tmp_files) > 0) {
|
||||
unlink(tmp_files)
|
||||
}
|
||||
}, add = TRUE)
|
||||
|
||||
# ===== 数据加载 =====
|
||||
log_add("开始加载输入数据")
|
||||
df <- tryCatch(
|
||||
load_input_data(input),
|
||||
error = function(e) {
|
||||
log_add(paste("数据加载失败:", e$message))
|
||||
return(NULL)
|
||||
}
|
||||
)
|
||||
|
||||
if (is.null(df)) {
|
||||
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
|
||||
}
|
||||
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
|
||||
|
||||
p <- input$params
|
||||
guardrails_cfg <- input$guardrails
|
||||
|
||||
group_var <- p$group_var
|
||||
value_var <- p$value_var
|
||||
|
||||
# ===== 参数校验 =====
|
||||
if (!(group_var %in% names(df))) {
|
||||
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
|
||||
}
|
||||
if (!(value_var %in% names(df))) {
|
||||
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = value_var))
|
||||
}
|
||||
|
||||
# ===== 数据清洗:移除分组变量或数值变量中的缺失值 =====
|
||||
original_rows <- nrow(df)
|
||||
|
||||
# 处理分组变量:移除 NA、空字符串、纯空白字符
|
||||
df <- df[!is.na(df[[group_var]]) & trimws(as.character(df[[group_var]])) != "", ]
|
||||
|
||||
# 处理数值变量:移除 NA
|
||||
df <- df[!is.na(df[[value_var]]), ]
|
||||
|
||||
removed_rows <- original_rows - nrow(df)
|
||||
if (removed_rows > 0) {
|
||||
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
|
||||
}
|
||||
|
||||
if (nrow(df) < 6) {
|
||||
return(make_error(ERROR_CODES$E004_SAMPLE_TOO_SMALL,
|
||||
n = nrow(df), min_required = 6))
|
||||
}
|
||||
|
||||
groups <- unique(df[[group_var]])
|
||||
if (length(groups) != 2) {
|
||||
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
|
||||
col = group_var, expected = 2, actual = length(groups)))
|
||||
}
|
||||
|
||||
# ===== 护栏检查 =====
|
||||
guardrail_results <- list()
|
||||
method_used <- "t.test"
|
||||
warnings_list <- c()
|
||||
|
||||
# 样本量检查
|
||||
g1_vals <- df[df[[group_var]] == groups[1], value_var]
|
||||
g2_vals <- df[df[[group_var]] == groups[2], value_var]
|
||||
|
||||
sample_check <- check_sample_size(min(length(g1_vals), length(g2_vals)),
|
||||
min_required = 3,
|
||||
action = ACTION_BLOCK)
|
||||
guardrail_results <- c(guardrail_results, list(sample_check))
|
||||
log_add(glue("样本量检查: {sample_check$reason}"))
|
||||
|
||||
# 正态性检验
|
||||
if (isTRUE(guardrails_cfg$check_normality)) {
|
||||
log_add("执行正态性检验")
|
||||
|
||||
for (g in groups) {
|
||||
vals <- df[df[[group_var]] == g, value_var]
|
||||
norm_check <- check_normality(vals,
|
||||
alpha = 0.05,
|
||||
action = ACTION_SWITCH,
|
||||
action_target = "ST_MANN_WHITNEY")
|
||||
guardrail_results <- c(guardrail_results, list(norm_check))
|
||||
log_add(glue("组[{g}] 正态性检验: p = {round(norm_check$p_value, 4)}, {norm_check$reason}"))
|
||||
}
|
||||
}
|
||||
|
||||
# 执行护栏链
|
||||
guardrail_status <- run_guardrail_chain(guardrail_results)
|
||||
|
||||
if (guardrail_status$status == "blocked") {
|
||||
return(list(
|
||||
status = "blocked",
|
||||
message = guardrail_status$reason,
|
||||
trace_log = logs
|
||||
))
|
||||
}
|
||||
|
||||
if (guardrail_status$status == "switch") {
|
||||
log_add(glue("触发方法切换: {guardrail_status$reason} -> {guardrail_status$target_tool}"))
|
||||
# TODO: 调用备选方法
|
||||
# 目前先继续执行 T 检验,但添加警告
|
||||
warnings_list <- c(warnings_list, guardrail_status$reason)
|
||||
}
|
||||
|
||||
if (length(guardrail_status$warnings) > 0) {
|
||||
warnings_list <- c(warnings_list, guardrail_status$warnings)
|
||||
}
|
||||
|
||||
# ===== 核心计算 =====
|
||||
log_add("执行 T 检验")
|
||||
result <- t.test(g1_vals, g2_vals, var.equal = FALSE)
|
||||
|
||||
# ===== 生成图表 =====
|
||||
log_add("生成箱线图")
|
||||
plot_base64 <- tryCatch({
|
||||
generate_boxplot(df, group_var, value_var, tmp_files)
|
||||
}, error = function(e) {
|
||||
log_add(paste("图表生成失败:", e$message))
|
||||
NULL
|
||||
})
|
||||
|
||||
# ===== 生成可复现代码 =====
|
||||
reproducible_code <- glue('
|
||||
# SSA-Pro 自动生成代码
|
||||
# 工具: 独立样本 T 检验
|
||||
# 时间: {Sys.time()}
|
||||
# ================================
|
||||
|
||||
# 自动安装依赖
|
||||
required_packages <- c("ggplot2")
|
||||
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
|
||||
if(length(new_packages)) install.packages(new_packages, repos = "https://cloud.r-project.org")
|
||||
|
||||
library(ggplot2)
|
||||
|
||||
# 数据准备
|
||||
df <- read.csv("your_data.csv")
|
||||
group_var <- "{group_var}"
|
||||
value_var <- "{value_var}"
|
||||
|
||||
# 独立样本 T 检验 (Welch)
|
||||
g1_vals <- df[df[[group_var]] == "{groups[1]}", value_var]
|
||||
g2_vals <- df[df[[group_var]] == "{groups[2]}", value_var]
|
||||
result <- t.test(g1_vals, g2_vals, var.equal = FALSE)
|
||||
print(result)
|
||||
|
||||
# 可视化
|
||||
ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
|
||||
geom_boxplot(fill = "#3b82f6", alpha = 0.6) +
|
||||
theme_minimal() +
|
||||
labs(title = paste("Distribution of", value_var, "by", group_var))
|
||||
')
|
||||
|
||||
# ===== 返回结果 =====
|
||||
log_add("分析完成")
|
||||
|
||||
return(list(
|
||||
status = "success",
|
||||
message = "分析完成",
|
||||
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
|
||||
results = list(
|
||||
method = result$method,
|
||||
statistic = jsonlite::unbox(as.numeric(result$statistic)),
|
||||
df = jsonlite::unbox(as.numeric(result$parameter)),
|
||||
p_value = jsonlite::unbox(as.numeric(result$p.value)),
|
||||
p_value_fmt = format_p_value(result$p.value),
|
||||
conf_int = as.numeric(result$conf.int),
|
||||
estimate = as.numeric(result$estimate),
|
||||
group_stats = list(
|
||||
list(group = as.character(groups[1]), n = length(g1_vals), mean = mean(g1_vals), sd = sd(g1_vals)),
|
||||
list(group = as.character(groups[2]), n = length(g2_vals), mean = mean(g2_vals), sd = sd(g2_vals))
|
||||
)
|
||||
),
|
||||
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
|
||||
trace_log = logs,
|
||||
reproducible_code = as.character(reproducible_code)
|
||||
))
|
||||
}
|
||||
|
||||
# 辅助函数:生成箱线图
|
||||
generate_boxplot <- function(df, group_var, value_var, tmp_files_ref) {
|
||||
p <- ggplot(df, aes(x = .data[[group_var]], y = .data[[value_var]])) +
|
||||
geom_boxplot(fill = "#3b82f6", alpha = 0.6) +
|
||||
theme_minimal() +
|
||||
labs(title = paste("Distribution of", value_var, "by", group_var))
|
||||
|
||||
tmp_file <- tempfile(fileext = ".png")
|
||||
|
||||
ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
|
||||
|
||||
base64_str <- base64encode(tmp_file)
|
||||
unlink(tmp_file)
|
||||
|
||||
return(paste0("data:image/png;base64,", base64_str))
|
||||
}
|
||||
128
r-statistics-service/utils/data_loader.R
Normal file
128
r-statistics-service/utils/data_loader.R
Normal file
@@ -0,0 +1,128 @@
|
||||
# utils/data_loader.R
|
||||
# 混合数据协议:自动识别 inline 数据 vs 预签名 URL
|
||||
#
|
||||
# 架构说明:
|
||||
# - R 服务不持有 OSS 密钥,遵循平台 OSS 存储规范
|
||||
# - Node.js 后端通过 storage.getUrl() 生成预签名 URL
|
||||
# - R 服务直接访问预签名 URL 下载数据
|
||||
# - 开发环境使用 ai-clinical-data-dev bucket,无需 Mock
|
||||
|
||||
library(httr)
|
||||
library(jsonlite)
|
||||
library(glue)
|
||||
|
||||
# 统一数据加载入口
|
||||
load_input_data <- function(input) {
|
||||
# 检查输入结构
|
||||
if (is.null(input$data_source)) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "请求缺少 data_source 字段"))
|
||||
}
|
||||
|
||||
source_type <- input$data_source$type # "inline" | "oss"
|
||||
|
||||
if (source_type == "inline") {
|
||||
# 方式1:内联 JSON 数据(< 2MB)
|
||||
message("[DataLoader] 使用 inline 数据模式")
|
||||
|
||||
raw_data <- input$data_source$data
|
||||
|
||||
# 调试:打印原始数据结构
|
||||
message(glue("[DataLoader] 原始数据类型: {class(raw_data)}"))
|
||||
message(glue("[DataLoader] 原始数据字段: {paste(names(raw_data), collapse=', ')}"))
|
||||
|
||||
# 安全转换:处理不同的 JSON 解析结果
|
||||
if (is.data.frame(raw_data)) {
|
||||
df <- raw_data
|
||||
} else if (is.list(raw_data)) {
|
||||
# JSON 对象 {"col1": [...], "col2": [...]} -> data.frame
|
||||
# JSON 数组可能被解析为 list 而非 vector,需要先 unlist
|
||||
df <- data.frame(
|
||||
lapply(raw_data, function(x) {
|
||||
if (is.list(x)) unlist(x) else x
|
||||
}),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
} else {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("无法解析的数据类型:", class(raw_data))))
|
||||
}
|
||||
|
||||
message(glue("[DataLoader] 转换后: {nrow(df)} 行, {ncol(df)} 列, 列名: {paste(names(df), collapse=', ')}"))
|
||||
return(df)
|
||||
|
||||
} else if (source_type == "oss") {
|
||||
# 方式2:从预签名 URL 下载(2MB - 20MB)
|
||||
# 注意:oss_url 是由 Node.js 后端生成的预签名 URL,不是 oss_key
|
||||
oss_url <- input$data_source$oss_url
|
||||
|
||||
if (is.null(oss_url) || oss_url == "") {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "OSS 模式缺少 oss_url 字段"))
|
||||
}
|
||||
|
||||
return(load_from_signed_url(oss_url))
|
||||
|
||||
} else {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("未知的 data_source.type:", source_type)))
|
||||
}
|
||||
}
|
||||
|
||||
# 从预签名 URL 下载数据
|
||||
#
|
||||
# @param url 预签名 URL(由 Node.js storage.getUrl() 生成)
|
||||
# @return data.frame
|
||||
#
|
||||
# 说明:开发环境和生产环境都使用真实 OSS
|
||||
# - 开发环境:ai-clinical-data-dev bucket
|
||||
# - 生产环境:ai-clinical-data bucket
|
||||
load_from_signed_url <- function(url) {
|
||||
|
||||
message(glue("[DataLoader] 从预签名 URL 下载数据"))
|
||||
|
||||
temp_file <- tempfile(fileext = ".csv")
|
||||
on.exit(unlink(temp_file))
|
||||
|
||||
tryCatch({
|
||||
# 预签名 URL 自带认证信息,直接 GET 即可
|
||||
response <- GET(url, write_disk(temp_file, overwrite = TRUE))
|
||||
|
||||
status <- status_code(response)
|
||||
if (status != 200) {
|
||||
# 403 通常表示签名过期
|
||||
if (status == 403) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "预签名 URL 已过期,请重新上传数据"))
|
||||
}
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("OSS 下载失败,HTTP 状态码:", status)))
|
||||
}
|
||||
|
||||
# 检测文件类型并读取
|
||||
content_type <- headers(response)$`content-type`
|
||||
|
||||
if (grepl("csv", content_type, ignore.case = TRUE) ||
|
||||
grepl("\\.csv", url, ignore.case = TRUE)) {
|
||||
return(read.csv(temp_file, stringsAsFactors = FALSE))
|
||||
} else if (grepl("excel|xlsx", content_type, ignore.case = TRUE) ||
|
||||
grepl("\\.xlsx?", url, ignore.case = TRUE)) {
|
||||
# 需要 readxl 包
|
||||
if (!requireNamespace("readxl", quietly = TRUE)) {
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = "Excel 文件需要 readxl 包"))
|
||||
}
|
||||
return(as.data.frame(readxl::read_excel(temp_file)))
|
||||
} else {
|
||||
# 默认尝试 CSV
|
||||
return(read.csv(temp_file, stringsAsFactors = FALSE))
|
||||
}
|
||||
|
||||
}, error = function(e) {
|
||||
if (grepl("make_error", deparse(e$call))) {
|
||||
stop(e) # 重新抛出已格式化的错误
|
||||
}
|
||||
stop(make_error(ERROR_CODES$E100_INTERNAL_ERROR,
|
||||
details = paste("OSS 网络错误:", e$message)))
|
||||
})
|
||||
}
|
||||
99
r-statistics-service/utils/error_codes.R
Normal file
99
r-statistics-service/utils/error_codes.R
Normal file
@@ -0,0 +1,99 @@
|
||||
# utils/error_codes.R
|
||||
# 结构化错误码,便于 LLM 自愈
|
||||
|
||||
ERROR_CODES <- list(
|
||||
# 业务错误(可被 Planner 修复)
|
||||
E001_COLUMN_NOT_FOUND = list(
|
||||
code = "E001",
|
||||
type = "business",
|
||||
message_template = "列名 '{col}' 在数据中不存在",
|
||||
user_hint = "请检查变量名是否拼写正确"
|
||||
),
|
||||
E002_TYPE_MISMATCH = list(
|
||||
code = "E002",
|
||||
type = "business",
|
||||
message_template = "列 '{col}' 类型应为 {expected},实际为 {actual}",
|
||||
user_hint = "该列包含非数值数据,请检查数据格式"
|
||||
),
|
||||
E003_INSUFFICIENT_GROUPS = list(
|
||||
code = "E003",
|
||||
type = "business",
|
||||
message_template = "分组变量 '{col}' 应有 {expected} 个水平,实际有 {actual} 个",
|
||||
user_hint = "分组变量的取值个数不符合要求"
|
||||
),
|
||||
E004_SAMPLE_TOO_SMALL = list(
|
||||
code = "E004",
|
||||
type = "business",
|
||||
message_template = "样本量 {n} 不足,至少需要 {min_required}",
|
||||
user_hint = "数据量太少,无法进行统计分析"
|
||||
),
|
||||
|
||||
# 统计计算错误(用户友好映射)
|
||||
E005_SINGULAR_MATRIX = list(
|
||||
code = "E005",
|
||||
type = "business",
|
||||
message_template = "矩阵计算异常: {details}",
|
||||
user_hint = "变量之间可能存在多重共线性,建议移除高度相关的变量"
|
||||
),
|
||||
E006_CONVERGENCE_FAILED = list(
|
||||
code = "E006",
|
||||
type = "business",
|
||||
message_template = "模型未能收敛: {details}",
|
||||
user_hint = "算法未能找到稳定解,可能需要调整参数或检查数据"
|
||||
),
|
||||
E007_VARIANCE_ZERO = list(
|
||||
code = "E007",
|
||||
type = "business",
|
||||
message_template = "变量 '{col}' 方差为零",
|
||||
user_hint = "该列的所有值都相同,无法进行比较"
|
||||
),
|
||||
|
||||
# 系统错误(需人工介入)
|
||||
E100_INTERNAL_ERROR = list(
|
||||
code = "E100",
|
||||
type = "system",
|
||||
message_template = "内部错误: {details}",
|
||||
user_hint = "系统繁忙,请稍后重试"
|
||||
),
|
||||
E101_PACKAGE_MISSING = list(
|
||||
code = "E101",
|
||||
type = "system",
|
||||
message_template = "缺少依赖包: {package}",
|
||||
user_hint = "请联系管理员"
|
||||
)
|
||||
)
|
||||
|
||||
# R 原始错误到错误码的映射字典
|
||||
R_ERROR_MAPPING <- list(
|
||||
"system is computationally singular" = "E005_SINGULAR_MATRIX",
|
||||
"did not converge" = "E006_CONVERGENCE_FAILED",
|
||||
"constant" = "E007_VARIANCE_ZERO"
|
||||
)
|
||||
|
||||
# 构造错误响应(含用户友好提示)
|
||||
make_error <- function(error_def, ...) {
|
||||
params <- list(...)
|
||||
msg <- error_def$message_template
|
||||
for (name in names(params)) {
|
||||
msg <- gsub(paste0("\\{", name, "\\}"), as.character(params[[name]]), msg)
|
||||
}
|
||||
return(list(
|
||||
status = "error",
|
||||
error_code = error_def$code,
|
||||
error_type = error_def$type,
|
||||
message = msg,
|
||||
user_hint = error_def$user_hint
|
||||
))
|
||||
}
|
||||
|
||||
# 尝试将 R 原始错误映射为友好错误码
|
||||
map_r_error <- function(raw_error_msg) {
|
||||
for (pattern in names(R_ERROR_MAPPING)) {
|
||||
if (grepl(pattern, raw_error_msg, ignore.case = TRUE)) {
|
||||
error_key <- R_ERROR_MAPPING[[pattern]]
|
||||
return(make_error(ERROR_CODES[[error_key]], details = raw_error_msg))
|
||||
}
|
||||
}
|
||||
# 无法映射,返回通用内部错误
|
||||
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = raw_error_msg))
|
||||
}
|
||||
116
r-statistics-service/utils/guardrails.R
Normal file
116
r-statistics-service/utils/guardrails.R
Normal file
@@ -0,0 +1,116 @@
|
||||
# utils/guardrails.R
|
||||
# 统计护栏函数库
|
||||
|
||||
library(glue)
|
||||
|
||||
# 大样本优化阈值
|
||||
LARGE_SAMPLE_THRESHOLD <- 5000
|
||||
|
||||
# 护栏 Action 类型
|
||||
ACTION_BLOCK <- "Block" # 阻止执行
|
||||
ACTION_WARN <- "Warn" # 警告但继续
|
||||
ACTION_SWITCH <- "Switch" # 切换到备选方法
|
||||
|
||||
# 正态性检验(支持三种 Action)
|
||||
check_normality <- function(values, alpha = 0.05, action = ACTION_SWITCH, action_target = NULL) {
|
||||
n <- length(values)
|
||||
|
||||
# 样本量过小
|
||||
if (n < 3) {
|
||||
return(list(
|
||||
passed = TRUE,
|
||||
action = NULL,
|
||||
action_target = NULL,
|
||||
reason = "样本量过小,跳过正态性检验",
|
||||
skipped = TRUE
|
||||
))
|
||||
}
|
||||
|
||||
# 大样本优化:N > 5000 时使用抽样检验
|
||||
if (n > LARGE_SAMPLE_THRESHOLD) {
|
||||
set.seed(42)
|
||||
sampled_values <- sample(values, 1000)
|
||||
test <- shapiro.test(sampled_values)
|
||||
passed <- test$p.value >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
action_target = if (passed) NULL else action_target,
|
||||
p_value = test$p.value,
|
||||
reason = glue("大样本(N={n})抽样检验,{if (passed) '满足正态性' else '不满足正态性'}"),
|
||||
sampled = TRUE,
|
||||
sample_size = 1000
|
||||
))
|
||||
}
|
||||
|
||||
# 常规检验
|
||||
test <- shapiro.test(values)
|
||||
passed <- test$p.value >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
action_target = if (passed) NULL else action_target,
|
||||
p_value = test$p.value,
|
||||
reason = if (passed) "满足正态性" else "不满足正态性",
|
||||
sampled = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
# 方差齐性检验 (Levene)
|
||||
check_homogeneity <- function(df, group_var, value_var, alpha = 0.05, action = ACTION_WARN) {
|
||||
library(car)
|
||||
|
||||
formula <- as.formula(paste(value_var, "~", group_var))
|
||||
test <- leveneTest(formula, data = df)
|
||||
p_val <- test$`Pr(>F)`[1]
|
||||
passed <- p_val >= alpha
|
||||
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
p_value = p_val,
|
||||
reason = if (passed) "方差齐性满足" else "方差不齐性"
|
||||
))
|
||||
}
|
||||
|
||||
# 样本量检验
|
||||
check_sample_size <- function(n, min_required = 3, action = ACTION_BLOCK) {
|
||||
passed <- n >= min_required
|
||||
return(list(
|
||||
passed = passed,
|
||||
action = if (passed) NULL else action,
|
||||
n = n,
|
||||
reason = if (passed) "样本量充足" else paste0("样本量不足, 需要至少 ", min_required)
|
||||
))
|
||||
}
|
||||
|
||||
# 执行护栏链(按 check_order 顺序执行)
|
||||
run_guardrail_chain <- function(guardrail_results) {
|
||||
warnings <- c()
|
||||
|
||||
for (result in guardrail_results) {
|
||||
if (!result$passed) {
|
||||
if (result$action == ACTION_BLOCK) {
|
||||
return(list(
|
||||
status = "blocked",
|
||||
reason = result$reason
|
||||
))
|
||||
} else if (result$action == ACTION_SWITCH) {
|
||||
return(list(
|
||||
status = "switch",
|
||||
target_tool = result$action_target,
|
||||
reason = result$reason
|
||||
))
|
||||
} else if (result$action == ACTION_WARN) {
|
||||
warnings <- c(warnings, result$reason)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return(list(
|
||||
status = "passed",
|
||||
warnings = warnings
|
||||
))
|
||||
}
|
||||
44
r-statistics-service/utils/result_formatter.R
Normal file
44
r-statistics-service/utils/result_formatter.R
Normal file
@@ -0,0 +1,44 @@
|
||||
# utils/result_formatter.R
|
||||
# 统计结果格式化,确保 p 值显示规范
|
||||
|
||||
# 格式化 p 值(符合 APA 规范)
|
||||
format_p_value <- function(p) {
|
||||
if (is.na(p)) return(NA)
|
||||
|
||||
if (p < 0.001) {
|
||||
return("< 0.001")
|
||||
} else {
|
||||
return(sprintf("%.3f", p))
|
||||
}
|
||||
}
|
||||
|
||||
# 构建标准化结果(包含 p_value_fmt)
|
||||
make_result <- function(p_value, statistic, method, ...) {
|
||||
list(
|
||||
p_value = p_value,
|
||||
p_value_fmt = format_p_value(p_value),
|
||||
statistic = statistic,
|
||||
method = method,
|
||||
...
|
||||
)
|
||||
}
|
||||
|
||||
# 格式化置信区间
|
||||
format_ci <- function(lower, upper, digits = 2) {
|
||||
sprintf("[%.2f, %.2f]", lower, upper)
|
||||
}
|
||||
|
||||
# 格式化效应量
|
||||
format_effect_size <- function(value, type = "d") {
|
||||
interpretation <- ""
|
||||
if (type == "d") { # Cohen's d
|
||||
if (abs(value) < 0.2) interpretation <- "微小"
|
||||
else if (abs(value) < 0.5) interpretation <- "小"
|
||||
else if (abs(value) < 0.8) interpretation <- "中等"
|
||||
else interpretation <- "大"
|
||||
}
|
||||
list(
|
||||
value = round(value, 3),
|
||||
interpretation = interpretation
|
||||
)
|
||||
}
|
||||
Reference in New Issue
Block a user