feat(asl/extraction): Complete Tool 3 M1+M2 - skeleton pipeline and HITL workbench

M1 Skeleton Pipeline:
- Scatter-dispatch + Aggregator polling pattern (PgBoss)
- PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs)
- ExtractionSingleWorker with DeepSeek-V3 LLM extraction
- PermanentExtractionError for non-retryable failures
- Phantom Retry Guard (idempotent worker)
- 3-step minimal frontend (Setup -> Progress -> Workbench)
- 4 new DB tables (extraction_templates, project_templates, tasks, results)
- Seed data for 3 system templates (RCT, Cohort, QC)
- M1 integration test suite

M2 HITL Workbench:
- MinerU VLM integration for high-fidelity table extraction
- XML-isolated DynamicPromptBuilder with flat JSON output template
- fuzzyQuoteMatch validator (3-tier confidence scoring)
- SSE real-time logging via ExtractionEventBus
- Schema-driven ExtractionDrawer (dynamic field rendering from template)
- Excel wide-table export with flattenModuleData normalization
- M2 integration test suite

Critical Fixes (data normalization):
- DynamicPromptBuilder: explicit flat key-value output format with example
- ExtractionExcelExporter: handle both array and flat data formats
- ExtractionDrawer: schema-driven rendering instead of hardcoded fields
- ExtractionValidator: array-format quote verification support
- SSE route: Fastify register encapsulation to bypass auth for EventSource
- LLM JSON sanitizer: strip illegal control chars before JSON.parse

Also includes: RVW stats verification spec, SSA expert config guide

Tested: M1 pipeline test + M2 HITL test + manual frontend verification
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-25 18:29:20 +08:00
parent 371fa53956
commit f0736dbca1
40 changed files with 6138 additions and 48 deletions


@@ -0,0 +1,76 @@
# ========================================
# Step 1: Descriptive statistics
# ========================================
# Auto-generated by SSA-Pro
# Tool: Descriptive statistics
# Time: 2026-02-25 07:58:34.356454
# ========================================
library(ggplot2)
# Data preparation
df <- read.csv("E:/test.csv")
# Descriptive statistics for numeric variables
numeric_vars <- sapply(df, is.numeric)
if (any(numeric_vars)) {
print(summary(df[, numeric_vars, drop = FALSE]))
}
# Frequency tables for categorical variables
categorical_vars <- !numeric_vars
if (any(categorical_vars)) {
for (v in names(df)[categorical_vars]) {
cat("\nVariable:", v, "\n")
print(table(df[[v]], useNA = "ifany"))
}
}
# ======== Visualization ========
# Bar chart: Yqol
p_Yqol <- ggplot(df[!is.na(df[["Yqol"]]), ], aes(x = factor(.data[["Yqol"]]))) +
geom_bar(fill = "#3b82f6", alpha = 0.7) +
labs(title = "Frequency of Yqol", x = "Yqol", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_Yqol)
# Bar chart: sex
p_sex <- ggplot(df[!is.na(df[["sex"]]), ], aes(x = factor(.data[["sex"]]))) +
geom_bar(fill = "#3b82f6", alpha = 0.7) +
labs(title = "Frequency of sex", x = "sex", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_sex)
# Bar chart: smoke
p_smoke <- ggplot(df[!is.na(df[["smoke"]]), ], aes(x = factor(.data[["smoke"]]))) +
geom_bar(fill = "#3b82f6", alpha = 0.7) +
labs(title = "Frequency of smoke", x = "smoke", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_smoke)
# Histogram: age
p_age <- ggplot(df[!is.na(df[["age"]]), ], aes(x = .data[["age"]])) +
geom_histogram(fill = "#3b82f6", alpha = 0.7, bins = 30) +
labs(title = "Distribution of age", x = "age", y = "Count") +
theme_minimal()
print(p_age)
# ========================================
# Step 2: Binary logistic regression
# ========================================
# Auto-generated by SSA-Pro
# Tool: Binary logistic regression
# Time: 2026-02-25 07:58:34.813076
# ========================================
# Data preparation
df <- read.csv("E:/test.csv")
# Model fitting
model <- glm(
  Yqol ~ sex + smoke + age + bmi + mouth_open + bucal_relax + toot_morph +
    root_number + root_curve + lenspace + denseratio + Pglevel + Pgverti +
    Winter + presyp + flap + operation + time + surgage + times,
  data = df, family = binomial(link = "logit")
)
summary(model)
# OR and 95% CI
coef_summary <- summary(model)$coefficients
OR <- exp(coef_summary[, "Estimate"])
CI_lower <- exp(coef_summary[, "Estimate"] - 1.96 * coef_summary[, "Std. Error"])
CI_upper <- exp(coef_summary[, "Estimate"] + 1.96 * coef_summary[, "Std. Error"])
results <- data.frame(OR = OR, CI_lower = CI_lower, CI_upper = CI_upper,
p_value = coef_summary[, "Pr(>|z|)"])
print(round(results, 3))
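The intervals above are Wald intervals (estimate ± 1.96 × SE on the log-odds scale, then exponentiated). Profile-likelihood intervals via `confint()` are often preferred for logistic models with modest samples. A minimal sketch on synthetic data, since `E:/test.csv` is not available here and `x`/`y` are placeholder names:

```r
# Hedged sketch: profile-likelihood CIs for a logistic model via confint().
# Synthetic data stands in for the real dataset; 'x' is a placeholder predictor.
set.seed(42)
d <- data.frame(x = rnorm(200))
d$y <- rbinom(200, 1, plogis(-0.3 + 0.8 * d$x))
m <- glm(y ~ x, data = d, family = binomial)
# exp() turns log-odds bounds into OR bounds, mirroring the Wald code above
print(exp(cbind(OR = coef(m), confint(m))))
```

For well-behaved fits the two interval types are close; they diverge most with small samples or separation, where the profile version is more trustworthy.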
# Model goodness of fit
cat("AIC:", AIC(model), "\n")
# VIF (requires the car package)
# library(car)
# vif(model)
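When the car package is unavailable, the same variance inflation factors can be computed in base R: regress each predictor on the others and take VIF_j = 1 / (1 − R²_j). A sketch under that definition; `vif_manual` and the synthetic data frame are illustrative names, not part of SSA-Pro's generated output:

```r
# Hedged base-R fallback for car::vif: VIF from the fitted model's
# design matrix (intercept dropped).
vif_manual <- function(fit) {
  X <- model.matrix(fit)[, -1, drop = FALSE]
  vifs <- sapply(seq_len(ncol(X)), function(j) {
    # R^2 from regressing predictor j on all remaining predictors
    r2 <- summary(lm(X[, j] ~ X[, -j, drop = FALSE]))$r.squared
    1 / (1 - r2)
  })
  setNames(vifs, colnames(X))
}

# Demo on synthetic data: 'c' is deliberately collinear with 'a'
set.seed(1)
d <- data.frame(a = rnorm(100), b = rnorm(100))
d$c <- d$a + rnorm(100, sd = 0.1)
d$y <- rbinom(100, 1, 0.5)
fit <- glm(y ~ a + b + c, data = d, family = binomial)
print(round(vif_manual(fit), 2))  # a and c should show inflated VIFs, b near 1
```

Note that car::vif additionally handles factors with multiple levels (generalized VIF); this sketch assumes numeric single-column predictors.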