M1 Skeleton Pipeline: - Scatter-dispatch + Aggregator polling pattern (PgBoss) - PKB ACL bridge (PkbBridgeService -> PkbExportService DTOs) - ExtractionSingleWorker with DeepSeek-V3 LLM extraction - PermanentExtractionError for non-retryable failures - Phantom Retry Guard (idempotent worker) - 3-step minimal frontend (Setup -> Progress -> Workbench) - 4 new DB tables (extraction_templates, project_templates, tasks, results) - 3 system templates seed (RCT, Cohort, QC) - M1 integration test suite M2 HITL Workbench: - MinerU VLM integration for high-fidelity table extraction - XML-isolated DynamicPromptBuilder with flat JSON output template - fuzzyQuoteMatch validator (3-tier confidence scoring) - SSE real-time logging via ExtractionEventBus - Schema-driven ExtractionDrawer (dynamic field rendering from template) - Excel wide-table export with flattenModuleData normalization - M2 integration test suite Critical Fixes (data normalization): - DynamicPromptBuilder: explicit flat key-value output format with example - ExtractionExcelExporter: handle both array and flat data formats - ExtractionDrawer: schema-driven rendering instead of hardcoded fields - ExtractionValidator: array-format quote verification support - SSE route: Fastify register encapsulation to bypass auth for EventSource - LLM JSON sanitizer: strip illegal control chars before JSON.parse Also includes: RVW stats verification spec, SSA expert config guide Tested: M1 pipeline test + M2 HITL test + manual frontend verification Co-authored-by: Cursor <cursoragent@cursor.com>
183 lines
11 KiB
TypeScript
183 lines
11 KiB
TypeScript
import { PrismaClient } from '@prisma/client';
|
||
|
||
// Single Prisma client used by this seed script; it is disconnected when the
// seed run finishes or fails.
const prisma = new PrismaClient();
|
||
|
||
/**
 * Seed data: built-in system extraction templates for ASL Tool 3.
 * Three templates: RCT / Cohort / QC.
 * Field definitions follow the "ASL Tool 3 full-text extraction data
 * dictionary and specification".
 */

// Builds one field descriptor. Every parameter has a string default, so the
// trailing `description` can be omitted for fields without hint text.
const defineField = (key = '', label = '', type = 'string', description = '') => ({
  key,
  label,
  type,
  description,
});

// Randomized controlled trial template.
const RCT_TEMPLATE = {
  code: 'RCT',
  name: '随机对照试验 (RCT)',
  description: '适用于随机对照试验文献的结构化数据提取,包含基线特征、RoB 2.0 偏倚风险评估和多种结局指标类型',
  baseFields: {
    metadata: [
      defineField('study_id', '研究标识', 'string', '第一作者+年份,如 Gandhi 2018'),
      defineField('nct_number', '临床试验注册号', 'string', 'ClinicalTrials.gov 注册号'),
      defineField('study_design', '研究设计类型', 'string', '如 RCT, Phase III RCT'),
      defineField('funding_source', '资金来源', 'string', '资助方与利益冲突声明'),
    ],
    baseline: [
      defineField('treatment_name', '实验组干预', 'string', '含剂量/频次'),
      defineField('control_name', '对照组干预', 'string', '如 Placebo'),
      defineField('n_treatment', '实验组样本量', 'integer', 'Table 1 中 N=xxx'),
      defineField('n_control', '对照组样本量', 'integer', 'Table 1 中 N=xxx'),
      defineField('age_treatment', '实验组年龄', 'string', 'Mean±SD 或 Median(IQR)'),
      defineField('age_control', '对照组年龄', 'string', 'Mean±SD 或 Median(IQR)'),
      defineField('male_percent', '男性比例(%)', 'string', '整体或分组'),
    ],
    rob: [
      defineField('rob_randomization', '随机序列产生', 'string', 'Low/High/Unclear Risk'),
      defineField('rob_allocation', '分配隐藏', 'string', 'Low/High/Unclear Risk'),
      defineField('rob_blinding', '盲法实施', 'string', 'Low/High/Unclear Risk'),
      defineField('rob_attrition', '失访与数据完整性', 'string', 'Low/High/Unclear Risk'),
    ],
    outcomes_survival: [
      defineField('endpoint_name', '终点名称', 'string', '如 OS, PFS, MACE'),
      defineField('hr_value', '风险比 (HR)', 'number', 'Hazard Ratio'),
      defineField('hr_ci_lower', '95%CI 下限', 'number'),
      defineField('hr_ci_upper', '95%CI 上限', 'number'),
      defineField('p_value', 'P 值', 'string', '如 <0.001 或 0.032'),
    ],
    outcomes_dichotomous: [
      defineField('event_treatment', '实验组事件数', 'integer', '发生事件的具体人数'),
      defineField('total_treatment', '实验组分析总人数', 'integer', '可能与基线总人数不同'),
      defineField('event_control', '对照组事件数', 'integer'),
      defineField('total_control', '对照组分析总人数', 'integer'),
    ],
    outcomes_continuous: [
      defineField('mean_treatment', '实验组均值', 'number'),
      defineField('sd_treatment', '实验组标准差', 'number', 'SD,若原文为 SE/CI 需换算'),
      defineField('n_treatment_outcome', '实验组分析人数', 'integer'),
      defineField('mean_control', '对照组均值', 'number'),
      defineField('sd_control', '对照组标准差', 'number'),
      defineField('n_control_outcome', '对照组分析人数', 'integer'),
    ],
  },
};

// Cohort study template.
const COHORT_TEMPLATE = {
  code: 'Cohort',
  name: '队列研究 (Cohort)',
  description: '适用于前瞻性/回顾性队列研究,基线特征与 RCT 类似但无随机化相关偏倚评估',
  baseFields: {
    metadata: [
      defineField('study_id', '研究标识', 'string', '第一作者+年份'),
      defineField('study_design', '研究设计类型', 'string', '如 Prospective Cohort, Retrospective Cohort'),
      defineField('funding_source', '资金来源', 'string'),
      defineField('follow_up_duration', '随访时长', 'string', '如 Median 5.2 years'),
    ],
    baseline: [
      defineField('exposure_group', '暴露组', 'string', '暴露因素描述'),
      defineField('control_group', '非暴露组/对照组', 'string'),
      defineField('n_exposure', '暴露组样本量', 'integer'),
      defineField('n_control', '对照组样本量', 'integer'),
      defineField('age_exposure', '暴露组年龄', 'string'),
      defineField('age_control', '对照组年龄', 'string'),
      defineField('male_percent', '男性比例(%)', 'string'),
    ],
    rob: [
      defineField('rob_selection', '选择偏倚', 'string', 'NOS: 代表性、非暴露组选择、暴露确定'),
      defineField('rob_comparability', '可比性', 'string', 'NOS: 混杂因素控制'),
      defineField('rob_outcome', '结局评估', 'string', 'NOS: 结局评估、随访充分性'),
    ],
    outcomes_survival: [
      defineField('endpoint_name', '终点名称', 'string'),
      defineField('hr_value', '风险比 (HR)', 'number', '调整后 HR'),
      defineField('hr_ci_lower', '95%CI 下限', 'number'),
      defineField('hr_ci_upper', '95%CI 上限', 'number'),
      defineField('p_value', 'P 值', 'string'),
    ],
    outcomes_dichotomous: [
      defineField('event_treatment', '暴露组事件数', 'integer'),
      defineField('total_treatment', '暴露组总人数', 'integer'),
      defineField('event_control', '对照组事件数', 'integer'),
      defineField('total_control', '对照组总人数', 'integer'),
    ],
    outcomes_continuous: [
      defineField('mean_treatment', '暴露组均值', 'number'),
      defineField('sd_treatment', '暴露组标准差', 'number'),
      defineField('n_treatment_outcome', '暴露组分析人数', 'integer'),
      defineField('mean_control', '对照组均值', 'number'),
      defineField('sd_control', '对照组标准差', 'number'),
      defineField('n_control_outcome', '对照组分析人数', 'integer'),
    ],
  },
};

// Quality-improvement study template; it defines no survival-outcome group.
const QC_TEMPLATE = {
  code: 'QC',
  name: '质量改进研究 (QI/QC)',
  description: '适用于质量改进研究,关注干预前后的指标变化,偏倚评估采用 ROBINS-I 简化版',
  baseFields: {
    metadata: [
      defineField('study_id', '研究标识', 'string', '第一作者+年份'),
      defineField('study_design', '研究设计类型', 'string', '如 Before-After, ITS'),
      defineField('setting', '研究场景', 'string', '如 ICU, Emergency Department'),
      defineField('funding_source', '资金来源', 'string'),
    ],
    baseline: [
      defineField('intervention_name', 'QI 干预措施', 'string', '质量改进措施描述'),
      defineField('comparator', '对照/基线', 'string', '如 Pre-intervention period'),
      defineField('n_intervention', '干预组样本量', 'integer'),
      defineField('n_comparator', '对照组样本量', 'integer'),
      defineField('duration_pre', '干预前观察期', 'string'),
      defineField('duration_post', '干预后观察期', 'string'),
    ],
    rob: [
      defineField('rob_confounding', '混杂偏倚', 'string', 'ROBINS-I 简化'),
      defineField('rob_measurement', '测量偏倚', 'string', '结局指标测量方法是否一致'),
      defineField('rob_reporting', '报告偏倚', 'string', '是否选择性报告'),
    ],
    outcomes_dichotomous: [
      defineField('event_treatment', '干预后事件数', 'integer'),
      defineField('total_treatment', '干预后总人数', 'integer'),
      defineField('event_control', '干预前事件数', 'integer'),
      defineField('total_control', '干预前总人数', 'integer'),
    ],
    outcomes_continuous: [
      defineField('mean_treatment', '干预后均值', 'number'),
      defineField('sd_treatment', '干预后标准差', 'number'),
      defineField('n_treatment_outcome', '干预后分析人数', 'integer'),
      defineField('mean_control', '干预前均值', 'number'),
      defineField('sd_control', '干预前标准差', 'number'),
      defineField('n_control_outcome', '干预前分析人数', 'integer'),
    ],
  },
};

/** All built-in templates, in seeding order: RCT, Cohort, QC. */
const SYSTEM_TEMPLATES = [RCT_TEMPLATE, COHORT_TEMPLATE, QC_TEMPLATE];
|
||
|
||
/**
 * Seeds the built-in extraction templates.
 *
 * Each entry of SYSTEM_TEMPLATES is upserted by its `code`: an existing row has
 * its name, description and baseFields refreshed; a missing row is created with
 * `isSystem: true`. Templates are written one at a time so the progress log
 * follows the order of SYSTEM_TEMPLATES.
 *
 * @returns {Promise<void>} Resolves after all templates are written and the
 *   summary line is logged; rejects with whatever error Prisma raises.
 */
async function main() {
  console.log('🌱 ASL 工具 3:注入系统内置提取模板...\n');

  for (const template of SYSTEM_TEMPLATES) {
    const { code, name, description, baseFields } = template;
    const result = await prisma.aslExtractionTemplate.upsert({
      where: { code },
      update: { name, description, baseFields },
      create: { code, name, description, baseFields, isSystem: true },
    });
    console.log(` ✅ ${result.code}: ${result.name}`);
  }

  // The summary reports *system* templates, so restrict the count to them;
  // an unfiltered count would also include any non-system rows in the table.
  const count = await prisma.aslExtractionTemplate.count({
    where: { isSystem: true },
  });
  console.log(`\n🎉 完成!共 ${count} 套系统模板。`);
}
|
||
|
||
// Script entry point: run the seed, always release the database connection,
// and exit with status 1 if anything fails.
void (async () => {
  try {
    await main();
    await prisma.$disconnect();
  } catch (seedError) {
    console.error('❌ Seed 失败:', seedError);
    await prisma.$disconnect();
    process.exit(1);
  }
})();
|