AIclinicalresearch/r-statistics-service/tools/descriptive.R

#' @tool_code ST_DESCRIPTIVE
#' @name 描述性统计
#' @version 1.0.0
#' @description 数据概况与基线特征表
#' @author SSA-Pro Team

library(glue)
library(ggplot2)
library(base64enc)

run_analysis <- function(input) {
  # ===== 初始化 =====
  logs <- c()
  log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }

  on.exit({}, add = TRUE)

  # ===== 数据加载 =====
  log_add("开始加载输入数据")
  df <- tryCatch(
    load_input_data(input),
    error = function(e) {
      log_add(paste("数据加载失败:", e$message))
      return(NULL)
    }
  )

  if (is.null(df)) {
    return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
  }
  log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))

  p <- input$params
  variables <- p$variables
  group_var <- p$group_var

  # Normalize group_var: ensure it's NULL or a valid non-empty string (never NA)
  if (is.null(group_var) || length(group_var) == 0 || isTRUE(is.na(group_var)) || !nzchar(trimws(as.character(group_var[1])))) {
    group_var <- NULL
  } else {
    group_var <- as.character(group_var[1])
  }

  log_add(glue("=== 输入参数 === variables: [{paste(variables, collapse=', ')}], group_var: {ifelse(is.null(group_var), 'NULL', group_var)}"))
  log_add(glue("=== 数据列 === [{paste(names(df), collapse=', ')}]"))

  # ===== 确定要分析的变量 =====
  if (is.null(variables) || length(variables) == 0) {
    variables <- names(df)
    log_add("未指定变量，分析全部列")
  }
  variables <- as.character(variables)

  # 排除分组变量本身
  if (!is.null(group_var) && group_var %in% variables) {
    variables <- setdiff(variables, group_var)
  }

  # 校验变量存在性
  missing_vars <- setdiff(variables, names(df))
  if (length(missing_vars) > 0) {
    log_add(glue("缺失变量: [{paste(missing_vars, collapse=', ')}]"))
    return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND,
                      col = paste(missing_vars, collapse = ", ")))
  }
  log_add(glue("最终分析变量 ({length(variables)}): [{paste(variables, collapse=', ')}]"))

  # 校验分组变量
  groups <- NULL
  if (!is.null(group_var)) {
    if (!(group_var %in% names(df))) {
      return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = group_var))
    }
    groups <- unique(df[[group_var]][!is.na(df[[group_var]])])
    log_add(glue("分组变量: {group_var}, 分组: {paste(groups, collapse=', ')}"))
  }

  # ===== 变量类型推断 =====
  var_types <- tryCatch({
    result <- sapply(variables, function(v) {
      vals <- df[[v]]
      if (is.null(vals)) return("categorical")
      if (isTRUE(is.numeric(vals))) {
        non_na_count <- sum(!is.na(vals))
        if (non_na_count == 0) return("categorical")
        unique_count <- length(unique(vals[!is.na(vals)]))
        unique_ratio <- unique_count / non_na_count
        if (isTRUE(unique_ratio < 0.05) && isTRUE(unique_count <= 10)) {
          return("categorical")
        }
        return("numeric")
      } else {
        return("categorical")
      }
    })
    if (is.null(names(result))) names(result) <- variables
    result
  }, error = function(e) {
    log_add(paste("变量类型推断失败:", e$message))
    setNames(rep("categorical", length(variables)), variables)
  })

  log_add(glue("数值变量: {sum(var_types == 'numeric', na.rm=TRUE)}, 分类变量: {sum(var_types == 'categorical', na.rm=TRUE)}"))
  log_add(glue("var_types 详情: {paste(names(var_types), '=', var_types, collapse=', ')}"))

  # ===== 计算描述性统计 =====
  warnings_list <- c()
  results_list <- list()

  for (v in variables) {
    var_type <- as.character(var_types[v])
    if (is.na(var_type) || length(var_type) == 0) {
      var_type <- "categorical"  # 默认为分类变量
    }

    if (is.null(groups)) {
      # 无分组
      if (identical(var_type, "numeric")) {
        stats <- calc_numeric_stats(df[[v]], v)
      } else {
        stats <- calc_categorical_stats(df[[v]], v)
      }
      stats$type <- var_type
      results_list[[v]] <- stats
    } else {
      # 有分组
      group_stats <- list()
      for (g in groups) {
        mask <- df[[group_var]] == g & !is.na(df[[group_var]])
        subset_vals <- df[mask, v, drop = TRUE]
        if (identical(var_type, "numeric")) {
          group_stats[[as.character(g)]] <- calc_numeric_stats(subset_vals, v)
        } else {
          group_stats[[as.character(g)]] <- calc_categorical_stats(subset_vals, v)
        }
      }
      results_list[[v]] <- list(
        variable = v,
        type = var_type,
        by_group = group_stats
      )
    }
  }

  # ===== 总体概况 =====
  summary_stats <- list(
    n_total = nrow(df),
    n_variables = length(variables),
    n_numeric = sum(var_types == "numeric"),
    n_categorical = sum(var_types == "categorical")
  )

  if (!is.null(groups)) {
    summary_stats$group_var <- group_var
    summary_stats$groups <- lapply(groups, function(g) {
      list(name = as.character(g), n = sum(df[[group_var]] == g, na.rm = TRUE))
    })
  }

  # ===== 生成图表 =====
  log_add("生成描述性统计图表")
  plots <- list()

  # 只为前几个变量生成图表（避免过多）
  vars_to_plot <- head(variables, 4)

  for (v in vars_to_plot) {
    plot_base64 <- tryCatch({
      if (isTRUE(var_types[v] == "numeric")) {
        generate_histogram(df, v, group_var)
      } else {
        generate_bar_chart(df, v, group_var)
      }
    }, error = function(e) {
      log_add(paste("图表生成失败:", v, e$message))
      NULL
    })

    if (!is.null(plot_base64)) {
      plots <- c(plots, list(plot_base64))
    }
  }

  # ===== 生成可复现代码 =====
  original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
    input$original_filename
  } else {
    "data.csv"
  }

  # Build dynamic visualization code based on actual variables
  plot_code_section <- tryCatch({
    plot_code_lines <- c()
    for (v in vars_to_plot) {
      safe_v <- gsub('"', '\\\\"', v)
      vt <- if (is.null(var_types) || is.na(var_types[v])) "categorical" else as.character(var_types[v])
      safe_var_name <- gsub("[^a-zA-Z0-9]", "_", v)
      if (vt == "numeric") {
        if (!is.null(group_var) && group_var != "") {
          safe_g <- gsub('"', '\\\\"', group_var)
          plot_code_lines <- c(plot_code_lines, glue('
# Histogram: {safe_v}
p_{safe_var_name} <- ggplot(df[!is.na(df[["{safe_v}"]]), ], aes(x = .data[["{safe_v}"]], fill = factor(.data[["{safe_g}"]]))) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
  scale_fill_brewer(palette = "Set1", name = "{safe_g}") +
  labs(title = "Distribution of {safe_v}", x = "{safe_v}", y = "Count") +
  theme_minimal()
print(p_{safe_var_name})
'))
        } else {
          plot_code_lines <- c(plot_code_lines, glue('
# Histogram: {safe_v}
p_{safe_var_name} <- ggplot(df[!is.na(df[["{safe_v}"]]), ], aes(x = .data[["{safe_v}"]])) +
  geom_histogram(fill = "#3b82f6", alpha = 0.7, bins = 30) +
  labs(title = "Distribution of {safe_v}", x = "{safe_v}", y = "Count") +
  theme_minimal()
print(p_{safe_var_name})
'))
        }
      } else {
        if (!is.null(group_var) && group_var != "") {
          safe_g <- gsub('"', '\\\\"', group_var)
          plot_code_lines <- c(plot_code_lines, glue('
# Bar chart: {safe_v}
p_{safe_var_name} <- ggplot(df[!is.na(df[["{safe_v}"]]), ], aes(x = factor(.data[["{safe_v}"]]), fill = factor(.data[["{safe_g}"]]))) +
  geom_bar(position = "dodge") +
  scale_fill_brewer(palette = "Set1", name = "{safe_g}") +
  labs(title = "Frequency of {safe_v}", x = "{safe_v}", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_{safe_var_name})
'))
        } else {
          plot_code_lines <- c(plot_code_lines, glue('
# Bar chart: {safe_v}
p_{safe_var_name} <- ggplot(df[!is.na(df[["{safe_v}"]]), ], aes(x = factor(.data[["{safe_v}"]]))) +
  geom_bar(fill = "#3b82f6", alpha = 0.7) +
  labs(title = "Frequency of {safe_v}", x = "{safe_v}", y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_{safe_var_name})
'))
        }
      }
    }
    paste(plot_code_lines, collapse = "\n")
  }, error = function(e) {
    log_add(paste("reproducible_code visualization generation failed:", e$message))
    "# ggplot(df, aes(x = your_variable)) + geom_histogram()"
  })

  reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: 描述性统计
# 时间: {Sys.time()}
# ================================

library(ggplot2)

# 数据准备
df <- read.csv("{original_filename}")

# 数值变量描述性统计
numeric_vars <- sapply(df, is.numeric)
if (any(numeric_vars)) {{
  print(summary(df[, numeric_vars, drop = FALSE]))
}}

# 分类变量频数表
categorical_vars <- !numeric_vars
if (any(categorical_vars)) {{
  for (v in names(df)[categorical_vars]) {{
    cat("\\n变量:", v, "\\n")
    print(table(df[[v]], useNA = "ifany"))
  }}
}}

# ======== 可视化 ========
{plot_code_section}
')

  # ===== 返回结果 =====
  log_add("分析完成")

  # ===== 构建 report_blocks =====
  blocks <- list()

  # Block 1: 数据概况
  kv_items <- list(
    "总样本量" = as.character(summary_stats$n_total),
    "变量数" = as.character(summary_stats$n_variables),
    "数值变量数" = as.character(summary_stats$n_numeric),
    "分类变量数" = as.character(summary_stats$n_categorical)
  )
  if (!is.null(groups)) {
    kv_items$group_var <- group_var
    kv_items$groups <- paste(sapply(summary_stats$groups, function(g) paste0(g$name, "(n=", g$n, ")")), collapse = ", ")
  }
  blocks[[length(blocks) + 1]] <- make_kv_block(kv_items, title = "数据概况")

  # Block 2: 数值变量汇总表
  numeric_vars <- names(results_list)[sapply(results_list, function(x) {
    if (is.list(x) && !is.null(x$type)) x$type == "numeric" else FALSE
  })]
  if (length(numeric_vars) > 0) {
    if (is.null(groups)) {
      num_headers <- c("变量名", "n", "mean", "sd", "median", "Q1", "Q3", "min", "max")
      num_rows <- lapply(numeric_vars, function(v) {
        s <- results_list[[v]]
        c(v, as.character(s$n), as.character(s$mean), as.character(s$sd),
          as.character(s$median), as.character(s$q1), as.character(s$q3),
          as.character(s$min), as.character(s$max))
      })
    } else {
      num_headers <- c("变量名", as.character(groups))
      num_rows <- lapply(numeric_vars, function(v) {
        s <- results_list[[v]]
        row <- c(v)
        for (g in groups) {
          gs <- s$by_group[[as.character(g)]]
          row <- c(row, if (!is.null(gs$formatted)) gs$formatted else "-")
        }
        row
      })
    }
    blocks[[length(blocks) + 1]] <- make_table_block(num_headers, num_rows, title = "数值变量汇总表")
  }

  # Block 3: 分类变量汇总表
  cat_vars <- names(results_list)[sapply(results_list, function(x) {
    if (is.list(x) && !is.null(x$type)) x$type == "categorical" else FALSE
  })]
  if (length(cat_vars) > 0) {
    cat_headers <- c("变量名", "水平", "n", "百分比")
    cat_rows <- list()
    for (v in cat_vars) {
      s <- results_list[[v]]
      if (is.null(groups)) {
        for (lev in s$levels) {
          cat_rows[[length(cat_rows) + 1]] <- c(v, lev$level, as.character(lev$n), paste0(lev$pct, "%"))
        }
      } else {
        for (g in groups) {
          gs <- s$by_group[[as.character(g)]]
          for (lev in gs$levels) {
            cat_rows[[length(cat_rows) + 1]] <- c(paste0(v, " (", g, ")"), lev$level, as.character(lev$n), paste0(lev$pct, "%"))
          }
        }
      }
    }
    if (length(cat_rows) > 0) {
      blocks[[length(blocks) + 1]] <- make_table_block(cat_headers, cat_rows, title = "分类变量汇总表")
    }
  }

  # Block 4+: 各图表
  for (i in seq_along(plots)) {
    blocks[[length(blocks) + 1]] <- make_image_block(plots[[i]], title = paste0("图表 ", i), alt = paste0("描述性统计图 ", i))
  }

  return(list(
    status = "success",
    message = "分析完成",
    warnings = if (length(warnings_list) > 0) warnings_list else NULL,
    results = list(
      summary = summary_stats,
      variables = results_list
    ),
    report_blocks = blocks,
    plots = plots,
    trace_log = logs,
    reproducible_code = as.character(reproducible_code)
  ))
}

# ===== 辅助函数 =====

# 数值变量统计
calc_numeric_stats <- function(vals, var_name) {
  vals <- vals[!is.na(vals)]
  n <- length(vals)

  if (n == 0) {
    return(list(
      variable = var_name,
      n = 0,
      missing = length(vals) - n,
      stats = NULL
    ))
  }

  list(
    variable = var_name,
    n = n,
    missing = 0,
    mean = round(mean(vals), 3),
    sd = round(sd(vals), 3),
    median = round(median(vals), 3),
    q1 = round(quantile(vals, 0.25), 3),
    q3 = round(quantile(vals, 0.75), 3),
    iqr = round(IQR(vals), 3),
    min = round(min(vals), 3),
    max = round(max(vals), 3),
    skewness = round(calc_skewness(vals), 3),
    formatted = paste0(round(mean(vals), 2), " ± ", round(sd(vals), 2))
  )
}

# 分类变量统计
calc_categorical_stats <- function(vals, var_name) {
  total <- length(vals)
  valid <- sum(!is.na(vals))

  freq_table <- table(vals, useNA = "no")

  levels_list <- lapply(names(freq_table), function(level) {
    count <- as.numeric(freq_table[level])
    pct <- round(count / valid * 100, 1)
    list(
      level = level,
      n = count,
      pct = pct,
      formatted = paste0(count, " (", pct, "%)")
    )
  })

  list(
    variable = var_name,
    n = valid,
    missing = total - valid,
    levels = levels_list
  )
}

# 计算偏度
calc_skewness <- function(x) {
  n <- length(x)
  if (n < 3) return(NA)
  m <- mean(x)
  s <- sd(x)
  sum((x - m)^3) / (n * s^3)
}

# 生成直方图
generate_histogram <- function(df, var_name, group_var = NULL) {
  if (!is.null(group_var) && group_var != "") {
    p <- ggplot(df[!is.na(df[[var_name]]), ], aes(x = .data[[var_name]], fill = factor(.data[[group_var]]))) +
      geom_histogram(alpha = 0.6, position = "identity", bins = 30) +
      scale_fill_brewer(palette = "Set1", name = group_var) +
      theme_minimal()
  } else {
    p <- ggplot(df[!is.na(df[[var_name]]), ], aes(x = .data[[var_name]])) +
      geom_histogram(fill = "#3b82f6", alpha = 0.7, bins = 30) +
      theme_minimal()
  }

  p <- p + labs(title = paste("Distribution of", var_name), x = var_name, y = "Count")

  tmp_file <- tempfile(fileext = ".png")
  ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
  base64_str <- base64encode(tmp_file)
  unlink(tmp_file)

  return(paste0("data:image/png;base64,", base64_str))
}

# 生成柱状图
generate_bar_chart <- function(df, var_name, group_var = NULL) {
  df_plot <- df[!is.na(df[[var_name]]), ]

  if (!is.null(group_var) && group_var != "") {
    p <- ggplot(df_plot, aes(x = factor(.data[[var_name]]), fill = factor(.data[[group_var]]))) +
      geom_bar(position = "dodge") +
      scale_fill_brewer(palette = "Set1", name = group_var) +
      theme_minimal()
  } else {
    p <- ggplot(df_plot, aes(x = factor(.data[[var_name]]))) +
      geom_bar(fill = "#3b82f6", alpha = 0.7) +
      theme_minimal()
  }

  p <- p + labs(title = paste("Frequency of", var_name), x = var_name, y = "Count") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  tmp_file <- tempfile(fileext = ".png")
  ggsave(tmp_file, p, width = 6, height = 4, dpi = 100)
  base64_str <- base64encode(tmp_file)
  unlink(tmp_file)

  return(paste0("data:image/png;base64,", base64_str))
}