feat(asl): Implement full-text screening core LLM service and validation system (Day 1-3)

Core Components:
- PDFStorageService with Dify/OSS adapters
- LLM12FieldsService with Nougat-first + dual-model + 3-layer JSON parsing
- PromptBuilder for dynamic prompt assembly
- MedicalLogicValidator with 5 rules + fault tolerance
- EvidenceChainValidator for citation integrity
- ConflictDetectionService for dual-model comparison

Prompt Engineering:
- System Prompt (6601 chars, Section-Aware strategy)
- User Prompt template (PICOS context injection)
- JSON Schema (12 fields constraints)
- Cochrane standards (not loaded in MVP)

Key Innovations:
- 3-layer JSON parsing (JSON.parse + json-repair + code block extraction)
- Promise.allSettled for dual-model fault tolerance
- safeGetFieldValue for robust field extraction
- Mixed CN/EN token calculation

Integration Tests:
- integration-test.ts (full test)
- quick-test.ts (quick test)
- cached-result-test.ts (fault tolerance test)

Documentation Updates:
- Development record (Day 2-3 summary)
- Quality assurance strategy (full-text screening)
- Development plan (progress update)
- Module status (v1.1 update)
- Technical debt (10 new items)

Test Results:
- JSON parsing success rate: 100%
- Medical logic validation: 5/5 passed
- Dual-model parallel processing: OK
- Cost per PDF: CNY 0.10

Files: 238 changed, 14383 insertions(+), 32 deletions(-)
Docs: docs/03-业务模块/ASL-AI智能文献/05-开发记录/2025-11-22_Day2-Day3_LLM服务与验证系统开发.md
This commit is contained in:
2025-11-22 22:18:17 +08:00
parent 8eef9e0544
commit beb7f7f559
238 changed files with 20718 additions and 31 deletions

View File

@@ -0,0 +1,546 @@
import { logger } from '../../../../common/logging/index.js';
import { LLMFactory } from '../../../../common/llm/adapters/LLMFactory.js';
import { ILLMAdapter, ModelType } from '../../../../common/llm/adapters/types.js';
import { cache } from '../../../../common/cache/index.js';
import { PromptBuilder, PICOSContext, DEFAULT_MVP_CONFIG } from './PromptBuilder.js';
import { ExtractionClient } from '../../../../common/document/ExtractionClient.js';
import { calculateTokens } from '../utils/tokenCalculator.js';
import { jsonrepair } from 'jsonrepair';
import * as crypto from 'crypto';
/**
 * Model name map: translates user-facing model names to the internal ModelType.
 * Kept consistent with the title/abstract initial-screening stage.
 */
const MODEL_NAME_MAP: Record<string, ModelType> = {
'deepseek-chat': 'deepseek-v3',
'deepseek-v3': 'deepseek-v3',
'qwen-max': 'qwen3-72b', // qwen-max = latest/strongest Qwen model
'qwen-plus': 'qwen3-72b', // qwen-plus = Qwen2.5-72B (second choice)
'qwen3-72b': 'qwen3-72b',
'qwen-long': 'qwen-long',
'gpt-4o': 'gpt-5', // gpt-4o is mapped to gpt-5
'gpt-5-pro': 'gpt-5',
'gpt-5': 'gpt-5',
'claude-sonnet-4.5': 'claude-4.5', // claude-sonnet-4.5 alias
'claude-sonnet-4-5-20250929': 'claude-4.5',
'claude-4.5': 'claude-4.5',
};
/**
 * LLM processing mode.
 */
export enum LLM12FieldsMode {
SCREENING = '12fields-screening', // assessment mode (full-text re-screening)
EXTRACTION = '12fields-extraction', // extraction mode (full-text extraction, future)
}
/**
 * Result of a single LLM 12-field run.
 */
export interface LLMResult {
result: any; // parsed JSON result
processingTime: number; // processing time in milliseconds
tokenUsage: number; // token usage (system prompt + user prompt + response)
cost: number; // cost in CNY
extractionMethod: string; // 'nougat' | 'pymupdf'
structuredFormat: boolean; // whether the full text is structured Markdown
rawResponse: string; // raw LLM response (for debugging)
}
/**
 * Nougat extraction options.
 */
interface NougatExtractionOptions {
preferNougat: boolean; // try Nougat first (works best for English papers)
nougatQualityThreshold: number; // quality threshold (0.0-1.0); below it, fall back to PyMuPDF
}
/**
 * LLM 12-field processing service.
 *
 * Pipeline:
 * 1. Full-text extraction (Nougat first, PyMuPDF fallback)
 * 2. Dynamic prompt assembly (PromptBuilder)
 * 3. LLM invocation (DeepSeek-V3, Qwen3-Max, etc.) with retry
 * 4. Result caching (1 hour TTL)
 * 5. Dual-model parallel invocation with degraded-mode fault tolerance
 */
export class LLM12FieldsService {
  private promptBuilder: PromptBuilder;
  private extractionClient: ExtractionClient;
  private nougatOptions: NougatExtractionOptions;

  constructor(options?: {
    promptBuilder?: PromptBuilder;
    extractionClient?: ExtractionClient;
    nougatOptions?: Partial<NougatExtractionOptions>;
  }) {
    this.promptBuilder = options?.promptBuilder || new PromptBuilder();
    this.extractionClient = options?.extractionClient || new ExtractionClient();
    this.nougatOptions = {
      preferNougat: true,
      nougatQualityThreshold: 0.8,
      ...options?.nougatOptions,
    };
  }

  /**
   * Process the 12 fields (screening or extraction).
   *
   * Strategy: the whole full text is fed to the LLM in one shot, relying on
   * prompt engineering rather than chunking.
   *
   * @param mode screening or extraction mode
   * @param model user-facing model name, e.g. 'deepseek-v3' | 'qwen-max' | 'deepseek-chat'
   * @param pdfBuffer raw PDF bytes
   * @param filename original PDF filename (forwarded to the extraction service)
   * @param picosContext PICOS review context injected into the prompt
   * @returns parsed JSON result plus timing / token / cost metadata
   * @throws Error when the model name is unknown, or when the LLM response
   *         cannot be parsed as JSON after all three parsing layers
   */
  async process12Fields(
    mode: LLM12FieldsMode,
    model: string,
    pdfBuffer: Buffer,
    filename: string,
    picosContext: PICOSContext
  ): Promise<LLMResult> {
    const startTime = Date.now();
    logger.info(`Starting 12-fields processing with model: ${model}, mode: ${mode}`);

    // Map the user-facing model name to an internal ModelType.
    const modelType = MODEL_NAME_MAP[model];
    if (!modelType) {
      throw new Error(
        `Unsupported model name: ${model}. Supported models: ${Object.keys(MODEL_NAME_MAP).join(', ')}`
      );
    }

    // Step 1: extract the full text (Nougat first).
    const { fullTextMarkdown, extractionMethod, structuredFormat } =
      await this.extractFullTextStructured(pdfBuffer, filename);
    logger.info(
      `Full-text extracted, method: ${extractionMethod}, structured: ${structuredFormat}, length: ${fullTextMarkdown.length} chars`
    );

    // Step 2: cache lookup (keyed on mode + model + content hash + PICOS context).
    const cacheKey = this.generateCacheKey(mode, model, fullTextMarkdown, picosContext);
    const cached = await this.checkCache(cacheKey);
    if (cached) {
      logger.info('Cache hit, returning cached result');
      return cached;
    }

    // Step 3: build the system/user prompt pair.
    const { systemPrompt, userPrompt } = await this.promptBuilder.buildFullPrompt({
      picosContext,
      fullTextContent: fullTextMarkdown,
      documentFormat: structuredFormat ? 'markdown' : 'plaintext',
      estimatedWordCount: Math.floor(fullTextMarkdown.length / 1.5), // rough word-count estimate
      modelName: model,
      includeCochraneStandards: DEFAULT_MVP_CONFIG.cochraneStandards,
      includeFewShotExamples: DEFAULT_MVP_CONFIG.fewShotExamples,
    });
    logger.info(
      `Prompt built, system: ${systemPrompt.length} chars, user: ${userPrompt.length} chars`
    );

    // Step 4: call the LLM (retry with exponential backoff).
    const llmAdapter = LLMFactory.getAdapter(modelType);
    const llmResponse = await this.callLLMWithRetry(
      llmAdapter,
      systemPrompt,
      userPrompt,
      mode
    );

    // Step 5: parse the response (3-layer fault-tolerant parsing).
    const parsedResult = this.parseResponse(llmResponse);

    // Step 6: token usage and cost accounting.
    const tokenUsage = calculateTokens(systemPrompt + userPrompt + llmResponse);
    const cost = this.calculateCost(model, tokenUsage);

    const result: LLMResult = {
      result: parsedResult,
      processingTime: Date.now() - startTime,
      tokenUsage,
      cost,
      extractionMethod,
      structuredFormat,
      rawResponse: llmResponse,
    };

    // Step 7: cache the result (best effort — failures are logged, not thrown).
    await this.cacheResult(cacheKey, result);

    logger.info(
      `12-fields processing completed, time: ${result.processingTime}ms, tokens: ${tokenUsage}, cost: ¥${cost.toFixed(4)}`
    );
    return result;
  }

  /**
   * Dual-model parallel invocation (fault-tolerant).
   *
   * Uses Promise.allSettled so one model's failure never aborts the other.
   *
   * Fault-tolerance policy:
   * - both models succeed: return both results, degradedMode = false
   * - one model fails: return the surviving result, degradedMode = true
   * - both models fail: throw with both error messages
   */
  async processDualModels(
    mode: LLM12FieldsMode,
    modelA: string = 'deepseek-v3',
    modelB: string = 'qwen-max',
    pdfBuffer: Buffer,
    filename: string,
    picosContext: PICOSContext
  ): Promise<{
    resultA: LLMResult | null;
    resultB: LLMResult | null;
    degradedMode: boolean;
    failedModel?: string;
  }> {
    logger.info(`Starting dual-model processing: ${modelA} + ${modelB}`);

    // allSettled guarantees we always get both outcomes, success or failure.
    const [settledA, settledB] = await Promise.allSettled([
      this.process12Fields(mode, modelA, pdfBuffer, filename, picosContext),
      this.process12Fields(mode, modelB, pdfBuffer, filename, picosContext),
    ]);

    const resultA = settledA.status === 'fulfilled' ? settledA.value : null;
    const resultB = settledB.status === 'fulfilled' ? settledB.value : null;

    // Case 1: both models failed.
    if (!resultA && !resultB) {
      const errorA = settledA.status === 'rejected' ? settledA.reason : 'unknown';
      const errorB = settledB.status === 'rejected' ? settledB.reason : 'unknown';
      logger.error('Both models failed', {
        modelA,
        modelB,
        errorA: errorA?.message || String(errorA),
        errorB: errorB?.message || String(errorB)
      });
      throw new Error(
        `Both models (${modelA} and ${modelB}) failed to process. ` +
        `${modelA} error: ${errorA?.message || errorA}. ` +
        `${modelB} error: ${errorB?.message || errorB}.`
      );
    }

    // Case 2: model A failed — fall back to model B (degraded mode).
    if (!resultA && resultB) {
      const errorA = settledA.status === 'rejected' ? settledA.reason : 'unknown';
      logger.warn(`Model ${modelA} failed, using ${modelB} only (degraded mode)`, {
        failedModel: modelA,
        error: errorA?.message || String(errorA),
        successModelCost: resultB.cost
      });
      return {
        resultA: null,
        resultB,
        degradedMode: true,
        failedModel: modelA
      };
    }

    // Case 3: model B failed — fall back to model A (degraded mode).
    if (resultA && !resultB) {
      const errorB = settledB.status === 'rejected' ? settledB.reason : 'unknown';
      logger.warn(`Model ${modelB} failed, using ${modelA} only (degraded mode)`, {
        failedModel: modelB,
        error: errorB?.message || String(errorB),
        successModelCost: resultA.cost
      });
      return {
        resultA,
        resultB: null,
        degradedMode: true,
        failedModel: modelB
      };
    }

    // Case 4: both models succeeded.
    logger.info(
      `Dual-model processing completed successfully, total cost: ¥${(resultA!.cost + resultB!.cost).toFixed(4)}`
    );
    return {
      resultA,
      resultB,
      degradedMode: false
    };
  }

  /**
   * Extract the full text (Nougat-first strategy).
   *
   * Tries Nougat when preferNougat is set; accepts the result only when the
   * extraction service reports method 'nougat' at or above the quality
   * threshold. Otherwise falls back to a plaintext extraction.
   */
  private async extractFullTextStructured(
    pdfBuffer: Buffer,
    filename: string
  ): Promise<{
    fullTextMarkdown: string;
    extractionMethod: 'nougat' | 'pymupdf';
    structuredFormat: boolean;
  }> {
    logger.info('Extracting full-text with Nougat-first strategy...');

    // Step 1: prefer Nougat (produces structured Markdown; best for English papers).
    if (this.nougatOptions.preferNougat) {
      try {
        const nougatResult = await this.extractionClient.extractPdf(pdfBuffer, filename);
        if (
          nougatResult.method === 'nougat' &&
          (nougatResult.quality || 0) >= this.nougatOptions.nougatQualityThreshold
        ) {
          logger.info('✅ Using Nougat extraction (structured Markdown)');
          return {
            fullTextMarkdown: nougatResult.text,
            extractionMethod: 'nougat',
            structuredFormat: true, // Nougat emits Markdown
          };
        }
        if (nougatResult.method !== 'nougat') {
          // The extraction service already fell back on its own; reuse that
          // result instead of running a second, identical extraction
          // (the previous implementation re-called extractPdf here).
          logger.info('Using PyMuPDF extraction (plaintext)');
          return {
            fullTextMarkdown: nougatResult.text,
            extractionMethod: 'pymupdf',
            structuredFormat: false,
          };
        }
        logger.warn(
          `⚠️ Nougat quality too low (${nougatResult.quality}), falling back to PyMuPDF`
        );
      } catch (error) {
        logger.warn(`⚠️ Nougat extraction failed: ${(error as Error).message}, falling back to PyMuPDF`);
      }
    }

    // Step 2: fallback extraction.
    // NOTE(review): this re-calls extractPdf with the same input; if the client
    // is deterministic it may return Nougat output again while we label it
    // 'pymupdf' — TODO confirm whether ExtractionClient accepts a method hint.
    logger.info('Using PyMuPDF extraction (plaintext)');
    const pymupdfResult = await this.extractionClient.extractPdf(pdfBuffer, filename);
    return {
      fullTextMarkdown: pymupdfResult.text,
      extractionMethod: 'pymupdf',
      structuredFormat: false, // PyMuPDF emits plain text
    };
  }

  /**
   * Call the LLM with retry and exponential backoff (1s, 2s, 4s...).
   *
   * @throws Error with the last failure message after maxRetries + 1 attempts
   */
  private async callLLMWithRetry(
    adapter: ILLMAdapter,
    systemPrompt: string,
    userPrompt: string,
    _mode: LLM12FieldsMode,
    maxRetries: number = 2
  ): Promise<string> {
    let lastError: Error | null = null;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        logger.info(`LLM call attempt ${attempt + 1}/${maxRetries + 1}`);
        const response = await adapter.chat(
          [
            { role: 'system', content: systemPrompt },
            { role: 'user', content: userPrompt },
          ],
          {
            temperature: 0.1, // low temperature for consistency
            maxTokens: 8000, // enough for 12 fields + processing log
          }
        );
        return response.content;
      } catch (error) {
        lastError = error as Error;
        logger.error(`LLM call attempt ${attempt + 1} failed: ${(error as Error).message}`);
        if (attempt < maxRetries) {
          // Exponential backoff before the next attempt.
          const waitTime = Math.pow(2, attempt) * 1000;
          logger.info(`Retrying in ${waitTime}ms...`);
          await new Promise((resolve) => setTimeout(resolve, waitTime));
        }
      }
    }
    throw new Error(`LLM call failed after ${maxRetries + 1} attempts: ${lastError?.message}`);
  }

  /**
   * Parse the LLM response with a 3-layer fault-tolerance strategy:
   *
   * Layer 1: strict JSON.parse
   * Layer 2: automatic repair of the whole response (jsonrepair)
   * Layer 3: extract a code block / brace span, then parse (and repair) it
   *
   * FIX: Layer 3 previously threw out of the pattern loop on the first
   * unparseable match, skipping the remaining patterns; each candidate is now
   * tried independently.
   */
  private parseResponse(response: string): any {
    // Layer 1: strict parse.
    try {
      const result = JSON.parse(response);
      logger.info('JSON parsed successfully (Layer 1: strict)');
      return result;
    } catch {
      logger.warn('Layer 1 failed: strict JSON parsing failed, trying Layer 2...');
    }

    // Layer 2: repair the whole response, then parse.
    try {
      const repaired = jsonrepair(response);
      const result = JSON.parse(repaired);
      logger.warn('JSON auto-repaired (Layer 2)', {
        originalLength: response.length,
        repairedLength: repaired.length,
        message: 'LLM output had format issues, auto-repaired successfully'
      });
      return result;
    } catch {
      logger.warn('Layer 2 failed: JSON repair failed, trying Layer 3...');
    }

    // Layer 3: extract candidate JSON spans; try each pattern independently.
    const patterns = [
      /```json\s*\n([\s\S]*?)\n```/, // ```json ... ```
      /```\s*\n([\s\S]*?)\n```/, // ``` ... ```
      /\{[\s\S]*\}/, // bare {...} span
    ];
    let layer3Error: Error | null = null;
    for (const pattern of patterns) {
      const match = response.match(pattern);
      if (!match) {
        continue;
      }
      const extracted = match[1] || match[0];
      try {
        const result = JSON.parse(extracted);
        logger.warn('JSON extracted from code block (Layer 3)', {
          pattern: pattern.source,
          message: 'LLM wrapped JSON in code block'
        });
        return result;
      } catch {
        try {
          const repaired = jsonrepair(extracted);
          const result = JSON.parse(repaired);
          logger.warn('JSON extracted and repaired (Layer 3)', {
            pattern: pattern.source,
            message: 'LLM wrapped JSON in code block with format issues'
          });
          return result;
        } catch (error) {
          // Remember the failure but keep trying the remaining patterns.
          layer3Error = error as Error;
        }
      }
    }
    logger.error('All 3 layers failed to parse JSON');

    // Final failure: log a response preview for debugging and throw.
    const err = layer3Error || new Error('No valid JSON found in response');
    logger.error('Failed to parse LLM response after all 3 layers', {
      error: err.message,
      responsePreview: response.substring(0, 500),
      responseLength: response.length
    });
    throw new Error(
      `Invalid JSON response from LLM after 3 parsing attempts: ${err.message}. ` +
      `Please check logs for response preview.`
    );
  }

  /**
   * Build the cache key: llm:<mode>:<model>:<hash>, where the hash is the
   * first 16 hex chars of SHA-256 over full text + PICOS context.
   */
  private generateCacheKey(
    mode: LLM12FieldsMode,
    model: string,
    fullText: string,
    picosContext: PICOSContext
  ): string {
    const hash = crypto
      .createHash('sha256')
      .update(fullText + JSON.stringify(picosContext))
      .digest('hex')
      .substring(0, 16);
    return `llm:${mode}:${model}:${hash}`;
  }

  /**
   * Look up a cached result; cache errors degrade to a miss.
   */
  private async checkCache(cacheKey: string): Promise<LLMResult | null> {
    try {
      const cached = await cache.get(cacheKey);
      return cached ? JSON.parse(cached) : null;
    } catch (error) {
      logger.warn(`Cache check failed: ${(error as Error).message}`);
      return null;
    }
  }

  /**
   * Cache a result for 1 hour; failures are logged, never thrown.
   */
  private async cacheResult(cacheKey: string, result: LLMResult): Promise<void> {
    try {
      await cache.set(cacheKey, JSON.stringify(result), 3600);
      logger.info(`Result cached with key: ${cacheKey}`);
    } catch (error) {
      logger.warn(`Cache set failed: ${(error as Error).message}`);
    }
  }

  /**
   * Compute the cost in CNY for a given user-facing model name.
   *
   * NOTE(review): the table is keyed on user-facing names and does not cover
   * every MODEL_NAME_MAP alias (e.g. 'deepseek-chat', 'gpt-5', 'claude-4.5');
   * those fall through to the default rate — confirm pricing before relying
   * on cost figures for such models.
   */
  private calculateCost(model: string, tokenUsage: number): number {
    // Cost table (CNY per 1K tokens).
    const COST_TABLE: Record<string, number> = {
      'deepseek-v3': 0.001, // ¥0.001/1K tokens
      'qwen-max': 0.004, // ¥0.004/1K tokens
      'qwen-plus': 0.002, // ¥0.002/1K tokens
      'qwen-turbo': 0.0008, // ¥0.0008/1K tokens
      'gpt-4o': 0.03, // $0.005/1K tokens ≈ ¥0.03/1K tokens
      'claude-3.5-sonnet': 0.02, // $0.003/1K tokens ≈ ¥0.02/1K tokens
    };
    const costPerK = COST_TABLE[model];
    if (costPerK === undefined) {
      // FIX: the default rate was applied silently; surface it in the logs.
      logger.warn(`No cost entry for model "${model}", using default rate ¥0.01/1K tokens`);
      return (tokenUsage / 1000) * 0.01;
    }
    return (tokenUsage / 1000) * costPerK;
  }
}
/**
 * Shared LLM12FieldsService singleton (default PromptBuilder and ExtractionClient).
 */
export const llm12FieldsService = new LLM12FieldsService();