Files
HaHafeng 3446909ff7 feat(ssa): Complete Phase I-IV intelligent dialogue and tool system development
Phase I - Session Blackboard + READ Layer:
- SessionBlackboardService with Postgres-Only cache
- DataProfileService for data overview generation
- PicoInferenceService for LLM-driven PICO extraction
- Frontend DataContextCard and VariableDictionaryPanel
- E2E tests: 31/31 passed

Phase II - Conversation Layer LLM + Intent Router:
- ConversationService with SSE streaming
- IntentRouterService (rule-first + LLM fallback, 6 intents)
- SystemPromptService with 6-segment dynamic assembly
- TokenTruncationService for context management
- ChatHandlerService as unified chat entry
- Frontend SSAChatPane and useSSAChat hook
- E2E tests: 38/38 passed

Phase III - Method Consultation + AskUser Standardization:
- ToolRegistryService with Repository Pattern
- MethodConsultService with DecisionTable + LLM enhancement
- AskUserService with global interrupt handling
- Frontend AskUserCard component
- E2E tests: 13/13 passed

Phase IV - Dialogue-Driven Analysis + QPER Integration:
- ToolOrchestratorService (plan/execute/report)
- analysis_plan SSE event for WorkflowPlan transmission
- Dual-channel confirmation (ask_user card + workspace button)
- PICO as optional hint for LLM parsing
- E2E tests: 25/25 passed

R Statistics Service:
- 5 new R tools: anova_one, baseline_table, fisher, linear_reg, wilcoxon
- Enhanced guardrails and block helpers
- Comprehensive test suite (run_all_tools_test.js)

Documentation:
- Updated system status document (v5.9)
- Updated SSA module status and development plan (v1.8)

Total E2E: 107/107 passed (Phase I: 31, Phase II: 38, Phase III: 13, Phase IV: 25)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-22 18:53:39 +08:00

273 lines
8.4 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#' @tool_code ST_FISHER
#' @name Fisher 精确检验
#' @version 1.0.0
#' @description 小样本或稀疏列联表的精确独立性检验(卡方检验的替代方法)
#' @author SSA-Pro Team
library(glue)
library(ggplot2)
library(base64enc)
run_analysis <- function(input) {
# ===== 初始化 =====
logs <- c()
log_add <- function(msg) { logs <<- c(logs, paste0("[", Sys.time(), "] ", msg)) }
on.exit({}, add = TRUE)
# ===== 数据加载 =====
log_add("开始加载输入数据")
df <- tryCatch(
load_input_data(input),
error = function(e) {
log_add(paste("数据加载失败:", e$message))
return(NULL)
}
)
if (is.null(df)) {
return(make_error(ERROR_CODES$E100_INTERNAL_ERROR, details = "数据加载失败"))
}
log_add(glue("数据加载成功: {nrow(df)} 行, {ncol(df)} 列"))
p <- input$params
var1 <- p$var1
var2 <- p$var2
# ===== 参数校验 =====
if (!(var1 %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var1))
}
if (!(var2 %in% names(df))) {
return(make_error(ERROR_CODES$E001_COLUMN_NOT_FOUND, col = var2))
}
# ===== 数据清洗 =====
original_rows <- nrow(df)
df <- df[!is.na(df[[var1]]) & trimws(as.character(df[[var1]])) != "", ]
df <- df[!is.na(df[[var2]]) & trimws(as.character(df[[var2]])) != "", ]
removed_rows <- original_rows - nrow(df)
if (removed_rows > 0) {
log_add(glue("数据清洗: 移除 {removed_rows} 行缺失值 (剩余 {nrow(df)} 行)"))
}
# ===== 护栏检查 =====
guardrail_results <- list()
warnings_list <- c()
sample_check <- check_sample_size(nrow(df), min_required = 4, action = ACTION_BLOCK)
guardrail_results <- c(guardrail_results, list(sample_check))
log_add(glue("样本量检查: N = {nrow(df)}, {sample_check$reason}"))
guardrail_status <- run_guardrail_chain(guardrail_results)
if (guardrail_status$status == "blocked") {
return(list(
status = "blocked",
message = guardrail_status$reason,
trace_log = logs
))
}
# ===== 构建列联表 =====
contingency_table <- table(df[[var1]], df[[var2]])
log_add(glue("列联表维度: {nrow(contingency_table)} x {ncol(contingency_table)}"))
if (nrow(contingency_table) < 2 || ncol(contingency_table) < 2) {
return(make_error(ERROR_CODES$E003_INSUFFICIENT_GROUPS,
col = paste(var1, "或", var2),
expected = 2,
actual = min(nrow(contingency_table), ncol(contingency_table))))
}
is_2x2 <- nrow(contingency_table) == 2 && ncol(contingency_table) == 2
# 期望频数信息(仅供报告)
expected <- chisq.test(contingency_table)$expected
low_expected_count <- sum(expected < 5)
total_cells <- length(expected)
low_expected_pct <- low_expected_count / total_cells
if (low_expected_pct > 0) {
log_add(glue("期望频数 < 5 的格子: {low_expected_count}/{total_cells} ({round(low_expected_pct * 100, 1)}%)"))
}
# ===== 核心计算 =====
log_add("执行 Fisher 精确检验")
result <- tryCatch({
if (is_2x2) {
fisher.test(contingency_table)
} else {
fisher.test(contingency_table, simulate.p.value = TRUE, B = 10000)
}
}, error = function(e) {
log_add(paste("Fisher 检验失败:", e$message))
return(NULL)
})
if (is.null(result)) {
return(map_r_error("Fisher 精确检验计算失败,列联表可能过大"))
}
method_used <- result$method
output_results <- list(
method = method_used,
p_value = jsonlite::unbox(as.numeric(result$p.value)),
p_value_fmt = format_p_value(result$p.value)
)
if (!is.null(result$estimate)) {
output_results$odds_ratio = jsonlite::unbox(as.numeric(result$estimate))
}
if (!is.null(result$conf.int)) {
output_results$conf_int = as.numeric(result$conf.int)
}
observed_matrix <- matrix(
as.numeric(contingency_table),
nrow = nrow(contingency_table),
ncol = ncol(contingency_table),
dimnames = list(rownames(contingency_table), colnames(contingency_table))
)
output_results$contingency_table <- list(
row_var = var1,
col_var = var2,
row_levels = as.character(rownames(contingency_table)),
col_levels = as.character(colnames(contingency_table)),
observed = observed_matrix,
row_totals = as.numeric(rowSums(contingency_table)),
col_totals = as.numeric(colSums(contingency_table)),
grand_total = jsonlite::unbox(sum(contingency_table))
)
log_add(glue("P = {round(result$p.value, 4)}"))
# ===== 生成图表 =====
log_add("生成堆叠条形图")
plot_base64 <- tryCatch({
generate_stacked_bar(contingency_table, var1, var2)
}, error = function(e) {
log_add(paste("图表生成失败:", e$message))
NULL
})
# ===== 生成可复现代码 =====
original_filename <- if (!is.null(input$original_filename) && nchar(input$original_filename) > 0) {
input$original_filename
} else {
"data.csv"
}
reproducible_code <- glue('
# SSA-Pro 自动生成代码
# 工具: Fisher 精确检验
# 时间: {Sys.time()}
# ================================
library(ggplot2)
# 数据准备
df <- read.csv("{original_filename}")
var1 <- "{var1}"
var2 <- "{var2}"
# 数据清洗
df <- df[!is.na(df[[var1]]) & !is.na(df[[var2]]), ]
# 构建列联表
contingency_table <- table(df[[var1]], df[[var2]])
print(contingency_table)
# Fisher 精确检验
result <- fisher.test(contingency_table)
print(result)
')
# ===== 构建 report_blocks =====
blocks <- list()
# Block 1: 列联表
table_headers <- c(var1, as.character(colnames(contingency_table)))
table_rows <- lapply(seq_len(nrow(contingency_table)), function(i) {
c(as.character(rownames(contingency_table)[i]), as.character(contingency_table[i, ]))
})
blocks[[length(blocks) + 1]] <- make_table_block(table_headers, table_rows, title = "列联表")
# Block 2: 检验结果
kv_items <- list(
"方法" = method_used,
"P 值" = output_results$p_value_fmt
)
if (!is.null(output_results$odds_ratio)) {
kv_items[["比值比 (OR)"]] <- as.character(round(as.numeric(output_results$odds_ratio), 4))
}
if (!is.null(output_results$conf_int)) {
kv_items[["95% 置信区间"]] <- sprintf("[%.4f, %.4f]", output_results$conf_int[1], output_results$conf_int[2])
}
if (low_expected_count > 0) {
kv_items[["期望频数 < 5 的格子"]] <- glue("{low_expected_count}/{total_cells}")
}
blocks[[length(blocks) + 1]] <- make_kv_block(kv_items, title = "检验结果")
# Block 3: 图表
if (!is.null(plot_base64)) {
blocks[[length(blocks) + 1]] <- make_image_block(plot_base64, title = "堆叠条形图",
alt = paste("堆叠条形图:", var1, "与", var2, "的关联"))
}
# Block 4: 结论摘要
p_val <- as.numeric(output_results$p_value)
conclusion <- if (p_val < 0.05) {
glue("Fisher 精确检验显示,{var1} 与 {var2} 之间存在显著关联P {output_results$p_value_fmt})。")
} else {
glue("Fisher 精确检验显示,未发现 {var1} 与 {var2} 之间的显著关联P {output_results$p_value_fmt})。")
}
if (!is.null(output_results$odds_ratio)) {
conclusion <- paste0(conclusion, glue(" 比值比 OR = {round(as.numeric(output_results$odds_ratio), 3)}。"))
}
blocks[[length(blocks) + 1]] <- make_markdown_block(conclusion, title = "结论摘要")
# ===== 返回结果 =====
log_add("分析完成")
return(list(
status = "success",
message = "分析完成",
warnings = if (length(warnings_list) > 0) warnings_list else NULL,
results = output_results,
report_blocks = blocks,
plots = if (!is.null(plot_base64)) list(plot_base64) else list(),
trace_log = logs,
reproducible_code = as.character(reproducible_code)
))
}
# 辅助函数:堆叠条形图
generate_stacked_bar <- function(contingency_table, var1, var2) {
df_plot <- as.data.frame(contingency_table)
names(df_plot) <- c("Var1", "Var2", "Freq")
p <- ggplot(df_plot, aes(x = Var1, y = Freq, fill = Var2)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = scales::percent) +
theme_minimal() +
labs(
title = paste("Association:", var1, "vs", var2),
x = var1,
y = "Proportion",
fill = var2
) +
scale_fill_brewer(palette = "Set2")
tmp_file <- tempfile(fileext = ".png")
ggsave(tmp_file, p, width = 7, height = 5, dpi = 100)
base64_str <- base64encode(tmp_file)
unlink(tmp_file)
return(paste0("data:image/png;base64,", base64_str))
}