// AIclinicalresearch/backend/scripts/seed-ssa-intent-prompt.ts

/**
 * SSA Intent Prompt Seed 脚本
 *
 * 将 SSA_QUERY_INTENT prompt 写入 capability_schema.prompt_templates
 * 运行: npx tsx scripts/seed-ssa-intent-prompt.ts
 */

import { PrismaClient, PromptStatus } from '@prisma/client';

// Shared Prisma client used by main(); `$disconnect()` is called on it in the
// `.finally()` handler at the end of this script.
const prisma = new PrismaClient();

/**
 * Prompt text that main() stores as the `content` of the SSA_QUERY_INTENT
 * prompt version.
 *
 * The text carries three `{{...}}` placeholders — `userQuery`, `dataProfile`
 * and `availableTools` — matching the `variables` list registered on the
 * template in main(). How and where they are substituted is not visible in
 * this file (presumably by the prompt renderer at call time — confirm).
 *
 * The prompt asks the model to reply with a single JSON object (goal, outcome
 * and predictor variables with their types, grouping variable, design,
 * confidence, reasoning), gives a confidence scoring rubric and six few-shot
 * examples.
 *
 * Markdown code fences inside the text are written as escaped backticks
 * because the whole prompt is a template literal.
 */
const SSA_INTENT_PROMPT = `你是一个临床统计分析意图理解引擎。你的任务是根据用户的自然语言描述和数据画像，解析出结构化的分析意图。

## 输入信息

### 用户请求
{{userQuery}}

### 数据画像
{{dataProfile}}

### 可用统计工具
{{availableTools}}

## 你的任务

请分析用户的请求，输出一个 JSON 对象（不要输出任何其他内容，只输出 JSON）：

\`\`\`json
{
  "goal": "comparison | correlation | regression | descriptive | cohort_study",
  "outcome_var": "结局变量名（Y），必须是数据画像中存在的列名，如果无法确定则为 null",
  "outcome_type": "continuous | binary | categorical | ordinal | datetime | null",
  "predictor_vars": ["自变量名列表（X），必须是数据画像中存在的列名"],
  "predictor_types": ["对应每个自变量的类型"],
  "grouping_var": "分组变量名，必须是数据画像中存在的列名，如果无法确定则为 null",
  "design": "independent | paired | longitudinal | cross_sectional",
  "confidence": 0.0到1.0之间的数字,
  "reasoning": "你的推理过程，用1-2句话说明为什么这样解析"
}
\`\`\`

## 关键规则

1. **变量名必须精确匹配数据画像中的列名**，不要翻译、缩写或改写。如果数据里是 "Blood_Pressure"，你就输出 "Blood_Pressure"，不要输出 "BP"。
2. 如果用户没有明确指出变量，请根据数据画像中的变量类型合理推断，但 confidence 应相应降低。
3. goal 为 "descriptive" 时，不需要 outcome_var 和 predictor_vars。

## Confidence 评分准则（严格按此打分）

- **0.9 - 1.0**: 用户的原话中明确指定了结局变量(Y)和至少一个自变量(X)，且这些变量在数据画像中存在。
- **0.7 - 0.8**: 用户指出了 Y 变量，但 X 需要根据数据类型推断；或用户的意图清晰但有轻微歧义。
- **0.5 - 0.6**: 用户意图大致清楚（如"帮我比较一下"），但没有具体指出任何变量名。
- **< 0.5**: 用户只说了"帮我分析一下"这样的模糊表达，既没有明确 Y 也没有明确 X，必须追问。

## Few-Shot 示例

### 示例 1：明确的差异比较
用户: "帮我比较 Treatment 组和 Control 组的 SBP 有没有差异"
数据画像中有: Group [categorical, 2个水平: Treatment/Control], SBP [numeric]
输出:
\`\`\`json
{"goal":"comparison","outcome_var":"SBP","outcome_type":"continuous","predictor_vars":["Group"],"predictor_types":["binary"],"grouping_var":"Group","design":"independent","confidence":0.95,"reasoning":"用户明确指定了分组变量Group和结局变量SBP，要求比较两组差异"}
\`\`\`

### 示例 2：相关分析
用户: "年龄和血压有关系吗？"
数据画像中有: Age [numeric], Blood_Pressure [numeric], Gender [categorical]
输出:
\`\`\`json
{"goal":"correlation","outcome_var":"Blood_Pressure","outcome_type":"continuous","predictor_vars":["Age"],"predictor_types":["continuous"],"grouping_var":null,"design":"independent","confidence":0.85,"reasoning":"用户想了解Age和Blood_Pressure的关系，两者都是连续变量，适合相关分析"}
\`\`\`

### 示例 3：多因素回归
用户: "什么因素影响患者的死亡率？"
数据画像中有: Death [categorical, 2个水平: 0/1], Age [numeric], BMI [numeric], Smoking [categorical, 2个水平: Yes/No], Stage [categorical, 4个水平]
输出:
\`\`\`json
{"goal":"regression","outcome_var":"Death","outcome_type":"binary","predictor_vars":["Age","BMI","Smoking","Stage"],"predictor_types":["continuous","continuous","binary","categorical"],"grouping_var":null,"design":"independent","confidence":0.8,"reasoning":"用户想分析影响死亡率的因素，Death是二分类结局，其余变量作为预测因素纳入logistic回归"}
\`\`\`

### 示例 4：统计学意义/检验
用户: "Yqol和bmi是否有统计学意义？"
数据画像中有: Yqol [numeric], bmi [numeric], sex [categorical], age [numeric]
输出:
\`\`\`json
{"goal":"correlation","outcome_var":"Yqol","outcome_type":"continuous","predictor_vars":["bmi"],"predictor_types":["continuous"],"grouping_var":null,"design":"independent","confidence":0.85,"reasoning":"用户想了解Yqol和bmi之间是否存在统计学显著关系，两者都是连续变量，适合相关分析或回归分析"}
\`\`\`

### 示例 5：模糊表达 — 需要追问
用户: "帮我分析一下这份数据"
数据画像中有: 10个变量
输出:
\`\`\`json
{"goal":"descriptive","outcome_var":null,"outcome_type":null,"predictor_vars":[],"predictor_types":[],"grouping_var":null,"design":"independent","confidence":0.35,"reasoning":"用户没有指定任何分析目标和变量，只能先做描述性统计，建议追问具体分析目的"}
\`\`\`

### 示例 6：队列研究
用户: "我想做一个完整的队列研究分析，看看新药对预后的影响"
数据画像中有: Drug [categorical, 2个水平], Outcome [categorical, 2个水平: 0/1], Age [numeric], Gender [categorical], BMI [numeric], Comorbidity [categorical]
输出:
\`\`\`json
{"goal":"cohort_study","outcome_var":"Outcome","outcome_type":"binary","predictor_vars":["Drug","Age","Gender","BMI","Comorbidity"],"predictor_types":["binary","continuous","binary","continuous","categorical"],"grouping_var":"Drug","design":"independent","confidence":0.85,"reasoning":"用户明确要做队列研究分析，Drug是暴露因素/分组变量，Outcome是结局，其余为协变量"}
\`\`\`

请只输出 JSON，不要输出其他内容。`;

/**
 * Writes SSA_INTENT_PROMPT into the prompt tables.
 *
 * - When a template with code `SSA_QUERY_INTENT` already exists, every version
 *   of it that is currently ACTIVE is set to ARCHIVED and the prompt is
 *   inserted as a new ACTIVE version, numbered one above the highest existing
 *   version (or 1 when the template has no versions yet).
 * - Otherwise the template is created first and the prompt is inserted as its
 *   version 1.
 *
 * @returns A promise that resolves once all writes have completed. Any Prisma
 *          error is propagated to the caller as a rejection.
 */
async function main(): Promise<void> {
  console.log('🚀 开始写入 SSA Intent Prompt...\n');

  const existing = await prisma.prompt_templates.findUnique({
    where: { code: 'SSA_QUERY_INTENT' }
  });

  if (existing) {
    console.log('⚠️  SSA_QUERY_INTENT 已存在 (id=%d)，创建新版本...', existing.id);

    const latestVersion = await prisma.prompt_versions.findFirst({
      where: { template_id: existing.id },
      orderBy: { version: 'desc' }
    });

    // No versions yet -> start at 1; otherwise continue the sequence.
    const newVersion = (latestVersion?.version ?? 0) + 1;

    // Archive the currently ACTIVE version(s) and insert the replacement in a
    // single transaction: if the insert fails, the archive step is rolled back
    // instead of leaving the template without any ACTIVE version.
    await prisma.$transaction([
      prisma.prompt_versions.updateMany({
        where: { template_id: existing.id, status: 'ACTIVE' },
        data: { status: 'ARCHIVED' }
      }),
      prisma.prompt_versions.create({
        data: {
          template_id: existing.id,
          version: newVersion,
          content: SSA_INTENT_PROMPT,
          model_config: { model: 'deepseek-v3', temperature: 0.3, maxTokens: 2048 },
          status: 'ACTIVE',
          changelog: `Phase Q v1.1: 6 组 Few-Shot (增加统计学意义示例) + Confidence Rubric 客观化`,
          created_by: 'system-seed',
        }
      })
    ]);

    console.log('   ✅ 新版本 v%d 已创建并设为 ACTIVE', newVersion);
  } else {
    console.log('📝 创建 SSA_QUERY_INTENT 模板...');

    const template = await prisma.prompt_templates.create({
      data: {
        code: 'SSA_QUERY_INTENT',
        name: 'SSA 意图理解 Prompt',
        module: 'SSA',
        description: 'Phase Q — 将用户自然语言转化为结构化的统计分析意图 (ParsedQuery)',
        // Placeholder names used as {{...}} inside SSA_INTENT_PROMPT.
        variables: ['userQuery', 'dataProfile', 'availableTools'],
      }
    });

    // NOTE(review): this changelog says "5 组 Few-Shot" while the prompt text
    // stored here contains six examples — confirm whether the wording should
    // be updated for fresh installs.
    await prisma.prompt_versions.create({
      data: {
        template_id: template.id,
        version: 1,
        content: SSA_INTENT_PROMPT,
        model_config: { model: 'deepseek-v3', temperature: 0.3, maxTokens: 2048 },
        status: 'ACTIVE',
        changelog: 'Phase Q v1.0: 初始版本，5 组 Few-Shot + Confidence Rubric',
        created_by: 'system-seed',
      }
    });

    console.log('   ✅ 模板 id=%d + 版本 v1 已创建', template.id);
  }

  console.log('\n✅ SSA Intent Prompt 写入完成！');
}

// Run the seed. On failure only the exit code is set: calling process.exit()
// here would terminate the process before the `.finally()` handler runs and
// the Prisma client would never be disconnected.
main()
  .catch(e => {
    console.error('❌ 写入失败:', e);
    process.exitCode = 1;
  })
  .finally(() => prisma.$disconnect());