feat(asl): Implement full-text screening core LLM service and validation system (Day 1-3)

Core Components:
- PDFStorageService with Dify/OSS adapters
- LLM12FieldsService with Nougat-first + dual-model + 3-layer JSON parsing
- PromptBuilder for dynamic prompt assembly
- MedicalLogicValidator with 5 rules + fault tolerance
- EvidenceChainValidator for citation integrity
- ConflictDetectionService for dual-model comparison

Prompt Engineering:
- System Prompt (6601 chars, Section-Aware strategy)
- User Prompt template (PICOS context injection)
- JSON Schema (12 fields constraints)
- Cochrane standards (not loaded in MVP)

Key Innovations:
- 3-layer JSON parsing (JSON.parse + json-repair + code block extraction)
- Promise.allSettled for dual-model fault tolerance
- safeGetFieldValue for robust field extraction
- Mixed CN/EN token calculation

Integration Tests:
- integration-test.ts (full test)
- quick-test.ts (quick test)
- cached-result-test.ts (fault tolerance test)

Documentation Updates:
- Development record (Day 2-3 summary)
- Quality assurance strategy (full-text screening)
- Development plan (progress update)
- Module status (v1.1 update)
- Technical debt (10 new items)

Test Results:
- JSON parsing success rate: 100%
- Medical logic validation: 5/5 passed
- Dual-model parallel processing: OK
- Cost per PDF: CNY 0.10

Files: 238 changed, 14383 insertions(+), 32 deletions(-)
Docs: docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-22_Day2-Day3_LLM服务与验证系统开发.md
This commit is contained in:
2025-11-22 22:18:17 +08:00
parent 8eef9e0544
commit beb7f7f559
238 changed files with 20718 additions and 31 deletions

View File

@@ -0,0 +1,546 @@
import { logger } from '../../../../common/logging/index.js';
import { LLMFactory } from '../../../../common/llm/adapters/LLMFactory.js';
import { ILLMAdapter, ModelType } from '../../../../common/llm/adapters/types.js';
import { cache } from '../../../../common/cache/index.js';
import { PromptBuilder, PICOSContext, DEFAULT_MVP_CONFIG } from './PromptBuilder.js';
import { ExtractionClient } from '../../../../common/document/ExtractionClient.js';
import { calculateTokens } from '../utils/tokenCalculator.js';
import { jsonrepair } from 'jsonrepair';
import * as crypto from 'crypto';
/**
 * Model name map: translates user-facing model names to the internal ModelType.
 * Kept consistent with the title/abstract initial-screening stage.
 */
const MODEL_NAME_MAP: Record<string, ModelType> = {
'deepseek-chat': 'deepseek-v3',
'deepseek-v3': 'deepseek-v3',
'qwen-max': 'qwen3-72b', // qwen-max = latest/strongest Qwen model
'qwen-plus': 'qwen3-72b', // qwen-plus = Qwen2.5-72B (second choice)
'qwen3-72b': 'qwen3-72b',
'qwen-long': 'qwen-long',
'gpt-4o': 'gpt-5', // gpt-4o is mapped to gpt-5
'gpt-5-pro': 'gpt-5',
'gpt-5': 'gpt-5',
'claude-sonnet-4.5': 'claude-4.5', // claude-sonnet-4.5 alias
'claude-sonnet-4-5-20250929': 'claude-4.5',
'claude-4.5': 'claude-4.5',
};
/**
 * LLM processing mode.
 */
export enum LLM12FieldsMode {
SCREENING = '12fields-screening', // assessment mode (full-text re-screening)
EXTRACTION = '12fields-extraction', // extraction mode (full-text extraction, future)
}
/**
 * Result of a single LLM 12-field run.
 */
export interface LLMResult {
result: any; // parsed JSON result
processingTime: number; // processing time in milliseconds
tokenUsage: number; // token usage (system prompt + user prompt + response)
cost: number; // cost in CNY
extractionMethod: string; // 'nougat' | 'pymupdf'
structuredFormat: boolean; // whether the full text is structured Markdown
rawResponse: string; // raw LLM response (for debugging)
}
/**
 * Nougat extraction options.
 */
interface NougatExtractionOptions {
preferNougat: boolean; // try Nougat first (works best for English papers)
nougatQualityThreshold: number; // quality threshold (0.0-1.0); below it, fall back to PyMuPDF
}
/**
 * LLM 12-field processing service.
 *
 * Pipeline:
 * 1. Full-text extraction (Nougat first, PyMuPDF fallback)
 * 2. Dynamic prompt assembly (PromptBuilder)
 * 3. LLM invocation (DeepSeek-V3, Qwen3-Max, etc.) with retry
 * 4. Result caching (1 hour TTL)
 * 5. Dual-model parallel invocation with degraded-mode fault tolerance
 */
export class LLM12FieldsService {
  private promptBuilder: PromptBuilder;
  private extractionClient: ExtractionClient;
  private nougatOptions: NougatExtractionOptions;

  constructor(options?: {
    promptBuilder?: PromptBuilder;
    extractionClient?: ExtractionClient;
    nougatOptions?: Partial<NougatExtractionOptions>;
  }) {
    this.promptBuilder = options?.promptBuilder || new PromptBuilder();
    this.extractionClient = options?.extractionClient || new ExtractionClient();
    this.nougatOptions = {
      preferNougat: true,
      nougatQualityThreshold: 0.8,
      ...options?.nougatOptions,
    };
  }

  /**
   * Process the 12 fields (screening or extraction).
   *
   * Strategy: the whole full text is fed to the LLM in one shot, relying on
   * prompt engineering rather than chunking.
   *
   * @param mode screening or extraction mode
   * @param model user-facing model name, e.g. 'deepseek-v3' | 'qwen-max' | 'deepseek-chat'
   * @param pdfBuffer raw PDF bytes
   * @param filename original PDF filename (forwarded to the extraction service)
   * @param picosContext PICOS review context injected into the prompt
   * @returns parsed JSON result plus timing / token / cost metadata
   * @throws Error when the model name is unknown, or when the LLM response
   *         cannot be parsed as JSON after all three parsing layers
   */
  async process12Fields(
    mode: LLM12FieldsMode,
    model: string,
    pdfBuffer: Buffer,
    filename: string,
    picosContext: PICOSContext
  ): Promise<LLMResult> {
    const startTime = Date.now();
    logger.info(`Starting 12-fields processing with model: ${model}, mode: ${mode}`);

    // Map the user-facing model name to an internal ModelType.
    const modelType = MODEL_NAME_MAP[model];
    if (!modelType) {
      throw new Error(
        `Unsupported model name: ${model}. Supported models: ${Object.keys(MODEL_NAME_MAP).join(', ')}`
      );
    }

    // Step 1: extract the full text (Nougat first).
    const { fullTextMarkdown, extractionMethod, structuredFormat } =
      await this.extractFullTextStructured(pdfBuffer, filename);
    logger.info(
      `Full-text extracted, method: ${extractionMethod}, structured: ${structuredFormat}, length: ${fullTextMarkdown.length} chars`
    );

    // Step 2: cache lookup (keyed on mode + model + content hash + PICOS context).
    const cacheKey = this.generateCacheKey(mode, model, fullTextMarkdown, picosContext);
    const cached = await this.checkCache(cacheKey);
    if (cached) {
      logger.info('Cache hit, returning cached result');
      return cached;
    }

    // Step 3: build the system/user prompt pair.
    const { systemPrompt, userPrompt } = await this.promptBuilder.buildFullPrompt({
      picosContext,
      fullTextContent: fullTextMarkdown,
      documentFormat: structuredFormat ? 'markdown' : 'plaintext',
      estimatedWordCount: Math.floor(fullTextMarkdown.length / 1.5), // rough word-count estimate
      modelName: model,
      includeCochraneStandards: DEFAULT_MVP_CONFIG.cochraneStandards,
      includeFewShotExamples: DEFAULT_MVP_CONFIG.fewShotExamples,
    });
    logger.info(
      `Prompt built, system: ${systemPrompt.length} chars, user: ${userPrompt.length} chars`
    );

    // Step 4: call the LLM (retry with exponential backoff).
    const llmAdapter = LLMFactory.getAdapter(modelType);
    const llmResponse = await this.callLLMWithRetry(
      llmAdapter,
      systemPrompt,
      userPrompt,
      mode
    );

    // Step 5: parse the response (3-layer fault-tolerant parsing).
    const parsedResult = this.parseResponse(llmResponse);

    // Step 6: token usage and cost accounting.
    const tokenUsage = calculateTokens(systemPrompt + userPrompt + llmResponse);
    const cost = this.calculateCost(model, tokenUsage);

    const result: LLMResult = {
      result: parsedResult,
      processingTime: Date.now() - startTime,
      tokenUsage,
      cost,
      extractionMethod,
      structuredFormat,
      rawResponse: llmResponse,
    };

    // Step 7: cache the result (best effort — failures are logged, not thrown).
    await this.cacheResult(cacheKey, result);

    logger.info(
      `12-fields processing completed, time: ${result.processingTime}ms, tokens: ${tokenUsage}, cost: ¥${cost.toFixed(4)}`
    );
    return result;
  }

  /**
   * Dual-model parallel invocation (fault-tolerant).
   *
   * Uses Promise.allSettled so one model's failure never aborts the other.
   *
   * Fault-tolerance policy:
   * - both models succeed: return both results, degradedMode = false
   * - one model fails: return the surviving result, degradedMode = true
   * - both models fail: throw with both error messages
   */
  async processDualModels(
    mode: LLM12FieldsMode,
    modelA: string = 'deepseek-v3',
    modelB: string = 'qwen-max',
    pdfBuffer: Buffer,
    filename: string,
    picosContext: PICOSContext
  ): Promise<{
    resultA: LLMResult | null;
    resultB: LLMResult | null;
    degradedMode: boolean;
    failedModel?: string;
  }> {
    logger.info(`Starting dual-model processing: ${modelA} + ${modelB}`);

    // allSettled guarantees we always get both outcomes, success or failure.
    const [settledA, settledB] = await Promise.allSettled([
      this.process12Fields(mode, modelA, pdfBuffer, filename, picosContext),
      this.process12Fields(mode, modelB, pdfBuffer, filename, picosContext),
    ]);

    const resultA = settledA.status === 'fulfilled' ? settledA.value : null;
    const resultB = settledB.status === 'fulfilled' ? settledB.value : null;

    // Case 1: both models failed.
    if (!resultA && !resultB) {
      const errorA = settledA.status === 'rejected' ? settledA.reason : 'unknown';
      const errorB = settledB.status === 'rejected' ? settledB.reason : 'unknown';
      logger.error('Both models failed', {
        modelA,
        modelB,
        errorA: errorA?.message || String(errorA),
        errorB: errorB?.message || String(errorB)
      });
      throw new Error(
        `Both models (${modelA} and ${modelB}) failed to process. ` +
        `${modelA} error: ${errorA?.message || errorA}. ` +
        `${modelB} error: ${errorB?.message || errorB}.`
      );
    }

    // Case 2: model A failed — fall back to model B (degraded mode).
    if (!resultA && resultB) {
      const errorA = settledA.status === 'rejected' ? settledA.reason : 'unknown';
      logger.warn(`Model ${modelA} failed, using ${modelB} only (degraded mode)`, {
        failedModel: modelA,
        error: errorA?.message || String(errorA),
        successModelCost: resultB.cost
      });
      return {
        resultA: null,
        resultB,
        degradedMode: true,
        failedModel: modelA
      };
    }

    // Case 3: model B failed — fall back to model A (degraded mode).
    if (resultA && !resultB) {
      const errorB = settledB.status === 'rejected' ? settledB.reason : 'unknown';
      logger.warn(`Model ${modelB} failed, using ${modelA} only (degraded mode)`, {
        failedModel: modelB,
        error: errorB?.message || String(errorB),
        successModelCost: resultA.cost
      });
      return {
        resultA,
        resultB: null,
        degradedMode: true,
        failedModel: modelB
      };
    }

    // Case 4: both models succeeded.
    logger.info(
      `Dual-model processing completed successfully, total cost: ¥${(resultA!.cost + resultB!.cost).toFixed(4)}`
    );
    return {
      resultA,
      resultB,
      degradedMode: false
    };
  }

  /**
   * Extract the full text (Nougat-first strategy).
   *
   * Tries Nougat when preferNougat is set; accepts the result only when the
   * extraction service reports method 'nougat' at or above the quality
   * threshold. Otherwise falls back to a plaintext extraction.
   */
  private async extractFullTextStructured(
    pdfBuffer: Buffer,
    filename: string
  ): Promise<{
    fullTextMarkdown: string;
    extractionMethod: 'nougat' | 'pymupdf';
    structuredFormat: boolean;
  }> {
    logger.info('Extracting full-text with Nougat-first strategy...');

    // Step 1: prefer Nougat (produces structured Markdown; best for English papers).
    if (this.nougatOptions.preferNougat) {
      try {
        const nougatResult = await this.extractionClient.extractPdf(pdfBuffer, filename);
        if (
          nougatResult.method === 'nougat' &&
          (nougatResult.quality || 0) >= this.nougatOptions.nougatQualityThreshold
        ) {
          logger.info('✅ Using Nougat extraction (structured Markdown)');
          return {
            fullTextMarkdown: nougatResult.text,
            extractionMethod: 'nougat',
            structuredFormat: true, // Nougat emits Markdown
          };
        }
        if (nougatResult.method !== 'nougat') {
          // The extraction service already fell back on its own; reuse that
          // result instead of running a second, identical extraction
          // (the previous implementation re-called extractPdf here).
          logger.info('Using PyMuPDF extraction (plaintext)');
          return {
            fullTextMarkdown: nougatResult.text,
            extractionMethod: 'pymupdf',
            structuredFormat: false,
          };
        }
        logger.warn(
          `⚠️ Nougat quality too low (${nougatResult.quality}), falling back to PyMuPDF`
        );
      } catch (error) {
        logger.warn(`⚠️ Nougat extraction failed: ${(error as Error).message}, falling back to PyMuPDF`);
      }
    }

    // Step 2: fallback extraction.
    // NOTE(review): this re-calls extractPdf with the same input; if the client
    // is deterministic it may return Nougat output again while we label it
    // 'pymupdf' — TODO confirm whether ExtractionClient accepts a method hint.
    logger.info('Using PyMuPDF extraction (plaintext)');
    const pymupdfResult = await this.extractionClient.extractPdf(pdfBuffer, filename);
    return {
      fullTextMarkdown: pymupdfResult.text,
      extractionMethod: 'pymupdf',
      structuredFormat: false, // PyMuPDF emits plain text
    };
  }

  /**
   * Call the LLM with retry and exponential backoff (1s, 2s, 4s...).
   *
   * @throws Error with the last failure message after maxRetries + 1 attempts
   */
  private async callLLMWithRetry(
    adapter: ILLMAdapter,
    systemPrompt: string,
    userPrompt: string,
    _mode: LLM12FieldsMode,
    maxRetries: number = 2
  ): Promise<string> {
    let lastError: Error | null = null;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        logger.info(`LLM call attempt ${attempt + 1}/${maxRetries + 1}`);
        const response = await adapter.chat(
          [
            { role: 'system', content: systemPrompt },
            { role: 'user', content: userPrompt },
          ],
          {
            temperature: 0.1, // low temperature for consistency
            maxTokens: 8000, // enough for 12 fields + processing log
          }
        );
        return response.content;
      } catch (error) {
        lastError = error as Error;
        logger.error(`LLM call attempt ${attempt + 1} failed: ${(error as Error).message}`);
        if (attempt < maxRetries) {
          // Exponential backoff before the next attempt.
          const waitTime = Math.pow(2, attempt) * 1000;
          logger.info(`Retrying in ${waitTime}ms...`);
          await new Promise((resolve) => setTimeout(resolve, waitTime));
        }
      }
    }
    throw new Error(`LLM call failed after ${maxRetries + 1} attempts: ${lastError?.message}`);
  }

  /**
   * Parse the LLM response with a 3-layer fault-tolerance strategy:
   *
   * Layer 1: strict JSON.parse
   * Layer 2: automatic repair of the whole response (jsonrepair)
   * Layer 3: extract a code block / brace span, then parse (and repair) it
   *
   * FIX: Layer 3 previously threw out of the pattern loop on the first
   * unparseable match, skipping the remaining patterns; each candidate is now
   * tried independently.
   */
  private parseResponse(response: string): any {
    // Layer 1: strict parse.
    try {
      const result = JSON.parse(response);
      logger.info('JSON parsed successfully (Layer 1: strict)');
      return result;
    } catch {
      logger.warn('Layer 1 failed: strict JSON parsing failed, trying Layer 2...');
    }

    // Layer 2: repair the whole response, then parse.
    try {
      const repaired = jsonrepair(response);
      const result = JSON.parse(repaired);
      logger.warn('JSON auto-repaired (Layer 2)', {
        originalLength: response.length,
        repairedLength: repaired.length,
        message: 'LLM output had format issues, auto-repaired successfully'
      });
      return result;
    } catch {
      logger.warn('Layer 2 failed: JSON repair failed, trying Layer 3...');
    }

    // Layer 3: extract candidate JSON spans; try each pattern independently.
    const patterns = [
      /```json\s*\n([\s\S]*?)\n```/, // ```json ... ```
      /```\s*\n([\s\S]*?)\n```/, // ``` ... ```
      /\{[\s\S]*\}/, // bare {...} span
    ];
    let layer3Error: Error | null = null;
    for (const pattern of patterns) {
      const match = response.match(pattern);
      if (!match) {
        continue;
      }
      const extracted = match[1] || match[0];
      try {
        const result = JSON.parse(extracted);
        logger.warn('JSON extracted from code block (Layer 3)', {
          pattern: pattern.source,
          message: 'LLM wrapped JSON in code block'
        });
        return result;
      } catch {
        try {
          const repaired = jsonrepair(extracted);
          const result = JSON.parse(repaired);
          logger.warn('JSON extracted and repaired (Layer 3)', {
            pattern: pattern.source,
            message: 'LLM wrapped JSON in code block with format issues'
          });
          return result;
        } catch (error) {
          // Remember the failure but keep trying the remaining patterns.
          layer3Error = error as Error;
        }
      }
    }
    logger.error('All 3 layers failed to parse JSON');

    // Final failure: log a response preview for debugging and throw.
    const err = layer3Error || new Error('No valid JSON found in response');
    logger.error('Failed to parse LLM response after all 3 layers', {
      error: err.message,
      responsePreview: response.substring(0, 500),
      responseLength: response.length
    });
    throw new Error(
      `Invalid JSON response from LLM after 3 parsing attempts: ${err.message}. ` +
      `Please check logs for response preview.`
    );
  }

  /**
   * Build the cache key: llm:<mode>:<model>:<hash>, where the hash is the
   * first 16 hex chars of SHA-256 over full text + PICOS context.
   */
  private generateCacheKey(
    mode: LLM12FieldsMode,
    model: string,
    fullText: string,
    picosContext: PICOSContext
  ): string {
    const hash = crypto
      .createHash('sha256')
      .update(fullText + JSON.stringify(picosContext))
      .digest('hex')
      .substring(0, 16);
    return `llm:${mode}:${model}:${hash}`;
  }

  /**
   * Look up a cached result; cache errors degrade to a miss.
   */
  private async checkCache(cacheKey: string): Promise<LLMResult | null> {
    try {
      const cached = await cache.get(cacheKey);
      return cached ? JSON.parse(cached) : null;
    } catch (error) {
      logger.warn(`Cache check failed: ${(error as Error).message}`);
      return null;
    }
  }

  /**
   * Cache a result for 1 hour; failures are logged, never thrown.
   */
  private async cacheResult(cacheKey: string, result: LLMResult): Promise<void> {
    try {
      await cache.set(cacheKey, JSON.stringify(result), 3600);
      logger.info(`Result cached with key: ${cacheKey}`);
    } catch (error) {
      logger.warn(`Cache set failed: ${(error as Error).message}`);
    }
  }

  /**
   * Compute the cost in CNY for a given user-facing model name.
   *
   * NOTE(review): the table is keyed on user-facing names and does not cover
   * every MODEL_NAME_MAP alias (e.g. 'deepseek-chat', 'gpt-5', 'claude-4.5');
   * those fall through to the default rate — confirm pricing before relying
   * on cost figures for such models.
   */
  private calculateCost(model: string, tokenUsage: number): number {
    // Cost table (CNY per 1K tokens).
    const COST_TABLE: Record<string, number> = {
      'deepseek-v3': 0.001, // ¥0.001/1K tokens
      'qwen-max': 0.004, // ¥0.004/1K tokens
      'qwen-plus': 0.002, // ¥0.002/1K tokens
      'qwen-turbo': 0.0008, // ¥0.0008/1K tokens
      'gpt-4o': 0.03, // $0.005/1K tokens ≈ ¥0.03/1K tokens
      'claude-3.5-sonnet': 0.02, // $0.003/1K tokens ≈ ¥0.02/1K tokens
    };
    const costPerK = COST_TABLE[model];
    if (costPerK === undefined) {
      // FIX: the default rate was applied silently; surface it in the logs.
      logger.warn(`No cost entry for model "${model}", using default rate ¥0.01/1K tokens`);
      return (tokenUsage / 1000) * 0.01;
    }
    return (tokenUsage / 1000) * costPerK;
  }
}
/**
 * Shared LLM12FieldsService singleton (default PromptBuilder and ExtractionClient).
 */
export const llm12FieldsService = new LLM12FieldsService();