refactor(backend): incremental architecture evolution (Task 19)

- Add common/ layer for shared capabilities (LLM, RAG, document, middleware)
- Add legacy/ layer for existing business code
- Move files to new structure (controllers, routes, services)
- Update index.ts for new route registration
- System remains fully functional
2025-11-16 15:42:44 +08:00
parent 8a17dc80ae
commit 0c5310fb77
39 changed files with 3904 additions and 353 deletions
--- a/backend/src/common/document/ExtractionClient.ts
+++ b/backend/src/common/document/ExtractionClient.ts
@@ -0,0 +1,272 @@
+import FormData from 'form-data';
+import axios from 'axios';
+
+/**
+ * Extraction Service Client
+ * Calls the Python microservice to extract document content.
+ */
+
+// Base URL of the extraction service. Because `||` is used, an unset OR empty
+// EXTRACTION_SERVICE_URL falls back to the local default.
+const EXTRACTION_SERVICE_URL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';
+
+/**
+ * Response body returned by the extraction endpoints used by ExtractionClient.
+ */
+export interface ExtractionResult {
+  success: boolean;
+  method: string; // extraction backend that produced the text: pymupdf/nougat/mammoth/direct
+  text: string;
+  // NOTE(review): presumably a quality score for the extracted text — scale is not visible here, confirm against the service.
+  quality?: number;
+  encoding?: string;
+  language?: string;
+  metadata: {
+    filename: string;
+    char_count?: number;
+    line_count?: number;
+    file_size?: number;
+    page_count?: number;
+    has_tables?: boolean;
+    // The service may attach additional, untyped metadata keys.
+    [key: string]: any;
+  };
+  error?: string;
+}
+
+/**
+ * HTTP client for the document-extraction service.
+ *
+ * Every upload method sends the file as multipart/form-data under the `file`
+ * field and resolves with the JSON body returned by the service.
+ */
+class ExtractionClient {
+  private baseUrl: string;
+
+  /**
+   * @param baseUrl Root URL of the extraction service (defaults to EXTRACTION_SERVICE_URL).
+   */
+  constructor(baseUrl: string = EXTRACTION_SERVICE_URL) {
+    this.baseUrl = baseUrl;
+  }
+
+  /**
+   * Uploads `file` as a multipart form to `path` and returns the response body.
+   *
+   * @param path Endpoint path appended to the base URL.
+   * @param file Raw file content.
+   * @param filename Name reported to the service for the uploaded part.
+   * @param timeout Request timeout in milliseconds.
+   * @param fields Extra text fields appended after the file part.
+   */
+  private async postFile<T>(
+    path: string,
+    file: Buffer,
+    filename: string,
+    timeout: number,
+    fields: Record<string, string> = {}
+  ): Promise<T> {
+    const formData = new FormData();
+    formData.append('file', file, filename);
+    for (const [key, value] of Object.entries(fields)) {
+      formData.append(key, value);
+    }
+
+    const response = await axios.post<T>(`${this.baseUrl}${path}`, formData, {
+      headers: {
+        ...formData.getHeaders(),
+      },
+      timeout,
+    });
+
+    return response.data;
+  }
+
+  /**
+   * Builds the Error thrown by the extract* methods.
+   *
+   * When the service answered with an error response, the message is
+   * `<prefix>: <detail>`, falling back to the axios message when the body has
+   * no usable `detail`. The body is read defensively: it may be null or a
+   * non-object, and `detail` may be a structured value, which is serialized
+   * as JSON instead of being rendered as "[object Object]".
+   *
+   * @param error The caught value.
+   * @param prefix Message prefix used when the service returned a response.
+   * @param fallback Message used when no response was received at all.
+   */
+  private toExtractionError(error: unknown, prefix: string, fallback: string): Error {
+    if (axios.isAxiosError(error) && error.response) {
+      const detail = (error.response.data as { detail?: unknown } | null | undefined)?.detail;
+
+      let detailText = '';
+      if (typeof detail === 'string') {
+        detailText = detail;
+      } else if (detail) {
+        detailText = JSON.stringify(detail);
+      }
+
+      return new Error(`${prefix}: ${detailText || error.message}`);
+    }
+
+    return new Error(fallback);
+  }
+
+  /**
+   * Health check.
+   *
+   * @returns The health payload reported by the service.
+   * @throws Error('Extraction service is unavailable') on any request failure.
+   */
+  async health(): Promise<{
+    status: string;
+    checks: any;
+    timestamp: string;
+  }> {
+    try {
+      // NOTE(review): no timeout is set here, so this call relies on axios defaults.
+      const response = await axios.get(`${this.baseUrl}/api/health`);
+      return response.data;
+    } catch (error) {
+      console.error('[ExtractionClient] Health check failed:', error);
+      throw new Error('Extraction service is unavailable');
+    }
+  }
+
+  /**
+   * Generic document extraction.
+   * The service detects the file type and picks the extraction method.
+   *
+   * @param file Raw file content.
+   * @param filename Original file name.
+   * @throws Error describing the failure (service detail when available).
+   */
+  async extractDocument(
+    file: Buffer,
+    filename: string
+  ): Promise<ExtractionResult> {
+    try {
+      // 2 minute timeout
+      return await this.postFile<ExtractionResult>('/api/extract', file, filename, 120000);
+    } catch (error) {
+      console.error('[ExtractionClient] Extract failed:', error);
+      throw this.toExtractionError(error, 'Extraction failed', 'Document extraction failed');
+    }
+  }
+
+  /**
+   * PDF-specific extraction.
+   *
+   * @param file Raw PDF content.
+   * @param filename Original file name.
+   * @param method Optional extraction method; sent as the `method` form field only when provided.
+   * @throws Error describing the failure (service detail when available).
+   */
+  async extractPdf(
+    file: Buffer,
+    filename: string,
+    method?: 'auto' | 'nougat' | 'pymupdf'
+  ): Promise<ExtractionResult> {
+    try {
+      const fields: Record<string, string> = {};
+      if (method) {
+        fields.method = method;
+      }
+
+      // 3 minute timeout (Nougat is slow)
+      return await this.postFile<ExtractionResult>(
+        '/api/extract/pdf',
+        file,
+        filename,
+        180000,
+        fields
+      );
+    } catch (error) {
+      console.error('[ExtractionClient] PDF extract failed:', error);
+      throw this.toExtractionError(error, 'PDF extraction failed', 'PDF extraction failed');
+    }
+  }
+
+  /**
+   * Docx-specific extraction.
+   *
+   * @param file Raw .docx content.
+   * @param filename Original file name.
+   * @throws Error describing the failure (service detail when available).
+   */
+  async extractDocx(
+    file: Buffer,
+    filename: string
+  ): Promise<ExtractionResult> {
+    try {
+      // 1 minute timeout
+      return await this.postFile<ExtractionResult>('/api/extract/docx', file, filename, 60000);
+    } catch (error) {
+      console.error('[ExtractionClient] Docx extract failed:', error);
+      throw this.toExtractionError(error, 'Docx extraction failed', 'Docx extraction failed');
+    }
+  }
+
+  /**
+   * Txt-specific extraction.
+   *
+   * @param file Raw text file content.
+   * @param filename Original file name.
+   * @throws Error describing the failure (service detail when available).
+   */
+  async extractTxt(
+    file: Buffer,
+    filename: string
+  ): Promise<ExtractionResult> {
+    try {
+      // 30 second timeout
+      return await this.postFile<ExtractionResult>('/api/extract/txt', file, filename, 30000);
+    } catch (error) {
+      console.error('[ExtractionClient] Txt extract failed:', error);
+      throw this.toExtractionError(error, 'Txt extraction failed', 'Txt extraction failed');
+    }
+  }
+
+  /**
+   * Detects the language of a PDF.
+   *
+   * @param file Raw file content.
+   * @param filename Original file name.
+   * @throws Error('Language detection failed') on any request failure.
+   */
+  async detectLanguage(
+    file: Buffer,
+    filename: string
+  ): Promise<{
+    language: string;
+    chinese_ratio: number;
+    chinese_chars: number;
+    total_chars: number;
+  }> {
+    try {
+      return await this.postFile<{
+        language: string;
+        chinese_ratio: number;
+        chinese_chars: number;
+        total_chars: number;
+      }>('/api/detect-language', file, filename, 30000);
+    } catch (error) {
+      console.error('[ExtractionClient] Language detection failed:', error);
+      throw new Error('Language detection failed');
+    }
+  }
+
+  /**
+   * Asks the service which processing strategy it recommends for a PDF.
+   *
+   * @param file Raw file content.
+   * @param filename Original file name.
+   * @throws Error('Get PDF strategy failed') on any request failure.
+   */
+  async getPdfStrategy(
+    file: Buffer,
+    filename: string
+  ): Promise<{
+    detected_language: string;
+    recommended_method: string;
+    reason: string;
+    nougat_available: boolean;
+  }> {
+    try {
+      return await this.postFile<{
+        detected_language: string;
+        recommended_method: string;
+        reason: string;
+        nougat_available: boolean;
+      }>('/api/pdf-strategy', file, filename, 30000);
+    } catch (error) {
+      console.error('[ExtractionClient] Get PDF strategy failed:', error);
+      throw new Error('Get PDF strategy failed');
+    }
+  }
+}
+
+// Shared singleton instance, built with the default base URL.
+export const extractionClient = new ExtractionClient();
+
+
+
+
+
+
--- a/backend/src/common/llm/adapters/DeepSeekAdapter.ts
+++ b/backend/src/common/llm/adapters/DeepSeekAdapter.ts
@@ -0,0 +1,150 @@
+import axios from 'axios';
+import { ILLMAdapter, Message, LLMOptions, LLMResponse, StreamChunk } from './types.js';
+import { config } from '../../../config/env.js';
+
+/**
+ * LLM adapter for the DeepSeek chat-completions HTTP API.
+ *
+ * Construction throws when no DeepSeek API key is present in config.
+ */
+export class DeepSeekAdapter implements ILLMAdapter {
+  modelName: string;
+  private apiKey: string;
+  private baseURL: string;
+
+  /**
+   * @param modelName Model identifier sent in the request body.
+   * @throws Error when `config.deepseekApiKey` is empty.
+   */
+  constructor(modelName: string = 'deepseek-chat') {
+    this.modelName = modelName;
+    this.apiKey = config.deepseekApiKey || '';
+    this.baseURL = 'https://api.deepseek.com/v1';
+
+    if (!this.apiKey) {
+      throw new Error('DeepSeek API key is not configured');
+    }
+  }
+
+  /**
+   * Non-streaming call.
+   *
+   * @param messages Conversation sent to the model.
+   * @param options Sampling options; defaults are temperature 0.7, max_tokens 2000, top_p 0.9.
+   * @returns The first choice's content together with token usage.
+   * @throws Error wrapping the API error message for axios failures; other errors are rethrown as-is.
+   */
+  async chat(messages: Message[], options?: LLMOptions): Promise<LLMResponse> {
+    try {
+      const response = await axios.post(
+        `${this.baseURL}/chat/completions`,
+        {
+          model: this.modelName,
+          messages: messages,
+          temperature: options?.temperature ?? 0.7,
+          max_tokens: options?.maxTokens ?? 2000,
+          top_p: options?.topP ?? 0.9,
+          stream: false,
+        },
+        {
+          headers: {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${this.apiKey}`,
+          },
+          timeout: 180000, // 180 s (3 min) - manuscript evaluation needs the longer window
+        }
+      );
+
+      const choice = response.data.choices[0];
+
+      return {
+        content: choice.message.content,
+        model: response.data.model,
+        usage: {
+          promptTokens: response.data.usage.prompt_tokens,
+          completionTokens: response.data.usage.completion_tokens,
+          totalTokens: response.data.usage.total_tokens,
+        },
+        finishReason: choice.finish_reason,
+      };
+    } catch (error: unknown) {
+      console.error('DeepSeek API Error:', error);
+      if (axios.isAxiosError(error)) {
+        throw new Error(
+          `DeepSeek API调用失败: ${error.response?.data?.error?.message || error.message}`
+        );
+      }
+      throw error;
+    }
+  }
+
+  /**
+   * Handles one SSE line: parses a `data: ` payload, notifies `onChunk` and yields the chunk.
+   *
+   * Blank lines, the `data: [DONE]` sentinel and non-data lines produce nothing.
+   * Any error raised while parsing or notifying is logged and the line is skipped.
+   */
+  private *emitSseLine(
+    line: string,
+    onChunk?: (chunk: StreamChunk) => void
+  ): Generator<StreamChunk, void, unknown> {
+    const trimmedLine = line.trim();
+    if (!trimmedLine || trimmedLine === 'data: [DONE]') {
+      return;
+    }
+    if (!trimmedLine.startsWith('data: ')) {
+      return;
+    }
+
+    try {
+      const jsonStr = trimmedLine.slice(6);
+      const data = JSON.parse(jsonStr);
+
+      const choice = data.choices[0];
+      const content = choice.delta?.content || '';
+
+      const streamChunk: StreamChunk = {
+        content: content,
+        done: choice.finish_reason === 'stop',
+        model: data.model,
+      };
+
+      if (choice.finish_reason === 'stop' && data.usage) {
+        streamChunk.usage = {
+          promptTokens: data.usage.prompt_tokens,
+          completionTokens: data.usage.completion_tokens,
+          totalTokens: data.usage.total_tokens,
+        };
+      }
+
+      if (onChunk) {
+        onChunk(streamChunk);
+      }
+
+      yield streamChunk;
+    } catch (parseError) {
+      console.error('Failed to parse SSE data:', parseError);
+    }
+  }
+
+  /**
+   * Streaming call.
+   *
+   * Reads the SSE response line by line and yields one StreamChunk per `data:` event.
+   *
+   * @param messages Conversation sent to the model.
+   * @param options Sampling options; same defaults as `chat`.
+   * @param onChunk Optional callback invoked with each chunk before it is yielded.
+   * @throws Error wrapping the API error message for axios failures; other errors are rethrown as-is.
+   */
+  async *chatStream(
+    messages: Message[],
+    options?: LLMOptions,
+    onChunk?: (chunk: StreamChunk) => void
+  ): AsyncGenerator<StreamChunk, void, unknown> {
+    try {
+      const response = await axios.post(
+        `${this.baseURL}/chat/completions`,
+        {
+          model: this.modelName,
+          messages: messages,
+          temperature: options?.temperature ?? 0.7,
+          max_tokens: options?.maxTokens ?? 2000,
+          top_p: options?.topP ?? 0.9,
+          stream: true,
+        },
+        {
+          headers: {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${this.apiKey}`,
+          },
+          responseType: 'stream',
+          timeout: 60000,
+        }
+      );
+
+      const stream = response.data;
+      // Network chunks can end in the middle of a multi-byte UTF-8 character.
+      // A streaming decoder carries the partial bytes over to the next chunk
+      // instead of emitting U+FFFD replacement characters.
+      const decoder = new TextDecoder('utf-8');
+      let buffer = '';
+
+      for await (const chunk of stream) {
+        buffer += typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
+        const lines = buffer.split('\n');
+        // The last element is an incomplete line (or ''); keep it for the next chunk.
+        buffer = lines.pop() || '';
+
+        for (const line of lines) {
+          yield* this.emitSseLine(line, onChunk);
+        }
+      }
+
+      // Flush any bytes still held by the decoder and process a final line
+      // that arrived without a trailing newline.
+      buffer += decoder.decode();
+      if (buffer) {
+        yield* this.emitSseLine(buffer, onChunk);
+      }
+    } catch (error) {
+      console.error('DeepSeek Stream Error:', error);
+      if (axios.isAxiosError(error)) {
+        throw new Error(
+          `DeepSeek流式调用失败: ${error.response?.data?.error?.message || error.message}`
+        );
+      }
+      throw error;
+    }
+  }
+}
+
--- a/backend/src/common/llm/adapters/LLMFactory.ts
+++ b/backend/src/common/llm/adapters/LLMFactory.ts
@@ -0,0 +1,84 @@
+import { ILLMAdapter, ModelType } from './types.js';
+import { DeepSeekAdapter } from './DeepSeekAdapter.js';
+import { QwenAdapter } from './QwenAdapter.js';
+
+/**
+ * Factory for LLM adapters.
+ * Builds the adapter that matches a model type and reuses it on later requests.
+ */
+export class LLMFactory {
+  private static adapters: Map<string, ILLMAdapter> = new Map();
+
+  // Model types reported by isSupported / getSupportedModels.
+  private static readonly MODEL_TYPES: readonly ModelType[] = [
+    'deepseek-v3',
+    'qwen3-72b',
+    'qwen-long',
+    'gemini-pro',
+  ];
+
+  /**
+   * Instantiates a fresh adapter for the given model type.
+   * @param modelType Model type to build an adapter for
+   * @throws Error for 'gemini-pro' (not implemented) and for unknown model types
+   */
+  private static createAdapter(modelType: ModelType): ILLMAdapter {
+    if (modelType === 'deepseek-v3') {
+      return new DeepSeekAdapter('deepseek-chat');
+    }
+    if (modelType === 'qwen3-72b') {
+      // Model name that corresponds to Qwen3-72B
+      return new QwenAdapter('qwen-plus');
+    }
+    if (modelType === 'qwen-long') {
+      // Long-text model with a 1M context window
+      return new QwenAdapter('qwen-long');
+    }
+    if (modelType === 'gemini-pro') {
+      // TODO: implement the Gemini adapter
+      throw new Error('Gemini adapter is not implemented yet');
+    }
+    throw new Error(`Unsupported model type: ${modelType}`);
+  }
+
+  /**
+   * Returns the adapter for a model type, creating it on first use (one instance per type).
+   * @param modelType Model type
+   * @returns The cached or newly created adapter
+   */
+  static getAdapter(modelType: ModelType): ILLMAdapter {
+    const existing = this.adapters.get(modelType);
+    if (existing) {
+      return existing;
+    }
+
+    const created = this.createAdapter(modelType);
+    this.adapters.set(modelType, created);
+    return created;
+  }
+
+  /**
+   * Drops cached adapters.
+   * @param modelType Optional; when given only that model's adapter is removed, otherwise all are
+   */
+  static clearCache(modelType?: ModelType): void {
+    if (!modelType) {
+      this.adapters.clear();
+      return;
+    }
+    this.adapters.delete(modelType);
+  }
+
+  /**
+   * Tells whether a model type is in the supported list.
+   * @param modelType Model type
+   * @returns true when listed
+   */
+  static isSupported(modelType: string): boolean {
+    return (this.MODEL_TYPES as readonly string[]).includes(modelType);
+  }
+
+  /**
+   * Lists every supported model type.
+   * @returns A new array of supported model types
+   */
+  static getSupportedModels(): ModelType[] {
+    return [...this.MODEL_TYPES];
+  }
+}
+
+
+
+
--- a/backend/src/common/llm/adapters/QwenAdapter.ts
+++ b/backend/src/common/llm/adapters/QwenAdapter.ts
@@ -0,0 +1,171 @@
+import axios from 'axios';
+import { ILLMAdapter, Message, LLMOptions, LLMResponse, StreamChunk } from './types.js';
+import { config } from '../../../config/env.js';
+
+/**
+ * LLM adapter for the DashScope (Qwen) text-generation HTTP API.
+ *
+ * Construction throws when no DashScope API key is present in config.
+ */
+export class QwenAdapter implements ILLMAdapter {
+  modelName: string;
+  private apiKey: string;
+  private baseURL: string;
+
+  /**
+   * @param modelName Model identifier sent in the request body.
+   * @throws Error when `config.dashscopeApiKey` is empty.
+   */
+  constructor(modelName: string = 'qwen-turbo') {
+    this.modelName = modelName;
+    this.apiKey = config.dashscopeApiKey || '';
+    this.baseURL = 'https://dashscope.aliyuncs.com/api/v1/services/aigc/text-generation/generation';
+
+    if (!this.apiKey) {
+      throw new Error('DashScope API key is not configured. Please set DASHSCOPE_API_KEY in .env file.');
+    }
+  }
+
+  /**
+   * Non-streaming call.
+   *
+   * @param messages Conversation sent to the model.
+   * @param options Sampling options; defaults are temperature 0.7, max_tokens 2000, top_p 0.9.
+   * @returns The first choice's content together with token usage.
+   * @throws Error wrapping the API error message for axios failures; other errors are rethrown as-is.
+   */
+  async chat(messages: Message[], options?: LLMOptions): Promise<LLMResponse> {
+    try {
+      const response = await axios.post(
+        this.baseURL,
+        {
+          model: this.modelName,
+          input: {
+            messages: messages,
+          },
+          parameters: {
+            temperature: options?.temperature ?? 0.7,
+            max_tokens: options?.maxTokens ?? 2000,
+            top_p: options?.topP ?? 0.9,
+            result_format: 'message',
+          },
+        },
+        {
+          headers: {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${this.apiKey}`,
+          },
+          timeout: 180000, // 180 s (3 min) - manuscript evaluation needs the longer window
+        }
+      );
+
+      const output = response.data.output;
+      const usage = response.data.usage;
+
+      return {
+        content: output.choices[0].message.content,
+        model: this.modelName,
+        usage: {
+          promptTokens: usage.input_tokens,
+          completionTokens: usage.output_tokens,
+          // Fall back to the sum when the API omits total_tokens.
+          totalTokens: usage.total_tokens || usage.input_tokens + usage.output_tokens,
+        },
+        finishReason: output.choices[0].finish_reason,
+      };
+    } catch (error: unknown) {
+      console.error('Qwen API Error:', error);
+      if (axios.isAxiosError(error)) {
+        throw new Error(
+          `Qwen API调用失败: ${error.response?.data?.message || error.message}`
+        );
+      }
+      throw error;
+    }
+  }
+
+  /**
+   * Handles one SSE line: parses a `data:` payload, notifies `onChunk` and yields the chunk.
+   *
+   * Blank lines, SSE comments (lines starting with ':') and non-data lines produce nothing.
+   * Any error raised while parsing or notifying is logged and the line is skipped.
+   */
+  private *emitSseLine(
+    line: string,
+    onChunk?: (chunk: StreamChunk) => void
+  ): Generator<StreamChunk, void, unknown> {
+    const trimmedLine = line.trim();
+    if (!trimmedLine || trimmedLine.startsWith(':')) {
+      return;
+    }
+    if (!trimmedLine.startsWith('data:')) {
+      return;
+    }
+
+    try {
+      const jsonStr = trimmedLine.slice(5).trim();
+      const data = JSON.parse(jsonStr);
+
+      const output = data.output;
+      const choice = output.choices[0];
+      const content = choice.message?.content || '';
+
+      const streamChunk: StreamChunk = {
+        content: content,
+        done: choice.finish_reason === 'stop',
+        model: this.modelName,
+      };
+
+      if (choice.finish_reason === 'stop' && data.usage) {
+        streamChunk.usage = {
+          promptTokens: data.usage.input_tokens,
+          completionTokens: data.usage.output_tokens,
+          totalTokens: data.usage.total_tokens || data.usage.input_tokens + data.usage.output_tokens,
+        };
+      }
+
+      if (onChunk) {
+        onChunk(streamChunk);
+      }
+
+      yield streamChunk;
+    } catch (parseError) {
+      console.error('Failed to parse Qwen SSE data:', parseError);
+    }
+  }
+
+  /**
+   * Streaming call.
+   *
+   * Reads the SSE response line by line and yields one StreamChunk per `data:` event.
+   *
+   * @param messages Conversation sent to the model.
+   * @param options Sampling options; same defaults as `chat`.
+   * @param onChunk Optional callback invoked with each chunk before it is yielded.
+   * @throws Error wrapping the API error message for axios failures; other errors are rethrown as-is.
+   */
+  async *chatStream(
+    messages: Message[],
+    options?: LLMOptions,
+    onChunk?: (chunk: StreamChunk) => void
+  ): AsyncGenerator<StreamChunk, void, unknown> {
+    try {
+      // Qwen-Long needs a longer timeout (full-text mode may transfer ~750K tokens)
+      const timeout = this.modelName === 'qwen-long' ? 300000 : 60000; // 5 min vs 1 min
+
+      console.log(`[QwenAdapter] 开始流式调用`, {
+        model: this.modelName,
+        timeout: `${timeout / 1000}秒`,
+        messagesCount: messages.length,
+      });
+
+      const response = await axios.post(
+        this.baseURL,
+        {
+          model: this.modelName,
+          input: {
+            messages: messages,
+          },
+          parameters: {
+            temperature: options?.temperature ?? 0.7,
+            max_tokens: options?.maxTokens ?? 2000,
+            top_p: options?.topP ?? 0.9,
+            result_format: 'message',
+            incremental_output: true,
+          },
+        },
+        {
+          headers: {
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${this.apiKey}`,
+            'X-DashScope-SSE': 'enable',
+          },
+          responseType: 'stream',
+          timeout: timeout,
+        }
+      );
+
+      const stream = response.data;
+      // Network chunks can end in the middle of a multi-byte UTF-8 character.
+      // A streaming decoder carries the partial bytes over to the next chunk
+      // instead of emitting U+FFFD replacement characters.
+      const decoder = new TextDecoder('utf-8');
+      let buffer = '';
+
+      for await (const chunk of stream) {
+        buffer += typeof chunk === 'string' ? chunk : decoder.decode(chunk, { stream: true });
+        const lines = buffer.split('\n');
+        // The last element is an incomplete line (or ''); keep it for the next chunk.
+        buffer = lines.pop() || '';
+
+        for (const line of lines) {
+          yield* this.emitSseLine(line, onChunk);
+        }
+      }
+
+      // Flush any bytes still held by the decoder and process a final line
+      // that arrived without a trailing newline.
+      buffer += decoder.decode();
+      if (buffer) {
+        yield* this.emitSseLine(buffer, onChunk);
+      }
+    } catch (error) {
+      console.error('Qwen Stream Error:', error);
+      if (axios.isAxiosError(error)) {
+        throw new Error(
+          `Qwen流式调用失败: ${error.response?.data?.message || error.message}`
+        );
+      }
+      throw error;
+    }
+  }
+}
+
--- a/backend/src/common/llm/adapters/types.ts
+++ b/backend/src/common/llm/adapters/types.ts
@@ -0,0 +1,58 @@
+// LLM适配器类型定义
+
+/** A single conversation message passed to an LLM adapter. */
+export interface Message {
+  role: 'system' | 'user' | 'assistant';
+  content: string;
+}
+
+/** Optional generation settings accepted by the adapters' chat / chatStream methods. */
+export interface LLMOptions {
+  temperature?: number;
+  maxTokens?: number;
+  topP?: number;
+  stream?: boolean;
+}
+
+/** Result of a non-streaming chat call. */
+export interface LLMResponse {
+  content: string;
+  model: string;
+  // Token accounting, when available.
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
+  finishReason?: string;
+}
+
+/** One piece of a streamed chat response. */
+export interface StreamChunk {
+  content: string;
+  // Set by the adapter to mark the final chunk of a stream.
+  done: boolean;
+  model?: string;
+  // Token accounting, when available.
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
+}
+
+// LLM adapter interface
+export interface ILLMAdapter {
+  // Model name
+  modelName: string;
+
+  // Non-streaming call
+  chat(messages: Message[], options?: LLMOptions): Promise<LLMResponse>;
+
+  // Streaming call; onChunk, when provided, is a per-chunk callback
+  chatStream(
+    messages: Message[],
+    options?: LLMOptions,
+    onChunk?: (chunk: StreamChunk) => void
+  ): AsyncGenerator<StreamChunk, void, unknown>;
+}
+
+// Supported model types
+export type ModelType = 'deepseek-v3' | 'qwen3-72b' | 'qwen-long' | 'gemini-pro';
+
+
+
+
--- a/backend/src/common/middleware/validateProject.ts
+++ b/backend/src/common/middleware/validateProject.ts
@@ -0,0 +1,113 @@
+import { FastifyRequest, FastifyReply } from 'fastify';
+
+// Expected request body for project creation.
+interface CreateProjectBody {
+  name: string;
+  background?: string;
+  researchType: 'observational' | 'interventional';
+}
+
+// Expected request body for project updates; every field is optional.
+interface UpdateProjectBody {
+  name?: string;
+  background?: string;
+  researchType?: 'observational' | 'interventional';
+}
+
+/**
+ * Validates the body of a create-project request.
+ *
+ * Sends a 400 response (and returns the reply) on the first failed check;
+ * returns undefined when the body is valid.
+ *
+ * @param request Incoming request whose body is validated.
+ * @param reply Reply used to send the 400 response.
+ */
+export async function validateProjectCreate(request: FastifyRequest, reply: FastifyReply) {
+  // request.body is absent when the client sends no payload; fall back to an
+  // empty object so that case is answered with a 400 instead of a TypeError.
+  const body = (request.body ?? {}) as Partial<CreateProjectBody>;
+
+  // Required field: name
+  if (!body.name || typeof body.name !== 'string') {
+    return reply.code(400).send({
+      success: false,
+      message: '项目名称为必填项',
+    });
+  }
+
+  if (body.name.trim().length === 0) {
+    return reply.code(400).send({
+      success: false,
+      message: '项目名称不能为空',
+    });
+  }
+
+  if (body.name.length > 100) {
+    return reply.code(400).send({
+      success: false,
+      message: '项目名称不能超过100个字符',
+    });
+  }
+
+  // Required field: research type
+  if (!body.researchType) {
+    return reply.code(400).send({
+      success: false,
+      message: '研究类型为必填项',
+    });
+  }
+
+  if (!['observational', 'interventional'].includes(body.researchType)) {
+    return reply.code(400).send({
+      success: false,
+      message: '研究类型必须是observational或interventional',
+    });
+  }
+
+  // Background is optional but length-limited
+  if (body.background && body.background.length > 2000) {
+    return reply.code(400).send({
+      success: false,
+      message: '项目背景不能超过2000个字符',
+    });
+  }
+}
+
+/**
+ * Validates the body of an update-project request.
+ *
+ * Sends a 400 response (and returns the reply) on the first failed check;
+ * returns undefined when the body is valid.
+ *
+ * @param request Incoming request whose body is validated.
+ * @param reply Reply used to send the 400 response.
+ */
+export async function validateProjectUpdate(request: FastifyRequest, reply: FastifyReply) {
+  // request.body is absent when the client sends no payload; fall back to an
+  // empty object so that case is answered with a 400 instead of a TypeError.
+  const body = (request.body ?? {}) as UpdateProjectBody;
+
+  // At least one field must be supplied. Presence is tested against
+  // null/undefined rather than truthiness, so an explicit empty string
+  // (e.g. clearing the background) still counts as a provided field and is
+  // judged by the per-field checks below.
+  const hasName = body.name != null;
+  const hasBackground = body.background != null;
+  const hasResearchType = body.researchType != null;
+
+  if (!hasName && !hasBackground && !hasResearchType) {
+    return reply.code(400).send({
+      success: false,
+      message: '至少需要提供一个要更新的字段',
+    });
+  }
+
+  // Name
+  if (body.name !== undefined) {
+    if (typeof body.name !== 'string' || body.name.trim().length === 0) {
+      return reply.code(400).send({
+        success: false,
+        message: '项目名称不能为空',
+      });
+    }
+
+    if (body.name.length > 100) {
+      return reply.code(400).send({
+        success: false,
+        message: '项目名称不能超过100个字符',
+      });
+    }
+  }
+
+  // Research type: any supplied value, including '', must be one of the allowed literals
+  if (hasResearchType && !['observational', 'interventional'].includes(body.researchType as string)) {
+    return reply.code(400).send({
+      success: false,
+      message: '研究类型必须是observational或interventional',
+    });
+  }
+
+  // Background
+  if (body.background && body.background.length > 2000) {
+    return reply.code(400).send({
+      success: false,
+      message: '项目背景不能超过2000个字符',
+    });
+  }
+}
+
+
+
+
--- a/backend/src/common/rag/DifyClient.ts
+++ b/backend/src/common/rag/DifyClient.ts
@@ -0,0 +1,323 @@
+import axios, { AxiosInstance, AxiosError } from 'axios';
+import FormData from 'form-data';
+import {
+  Dataset,
+  CreateDatasetRequest,
+  CreateDatasetResponse,
+  DatasetListResponse,
+  Document,
+  DocumentListResponse,
+  CreateDocumentByFileRequest,
+  CreateDocumentResponse,
+  RetrievalRequest,
+  RetrievalResponse,
+  DifyError,
+  DifyErrorResponse,
+} from './types.js';
+import { config } from '../../config/env.js';
+
+/**
+ * Dify API client
+ *
+ * Wraps the Dify knowledge-base (dataset) related APIs.
+ */
+export class DifyClient {
+  private client: AxiosInstance;
+  private apiKey: string;
+  private apiUrl: string;
+
+  /**
+   * @param apiKey Dify API key; falls back to config.difyApiKey when omitted or empty
+   * @param apiUrl Dify API base URL; falls back to config.difyApiUrl when omitted or empty
+   * @throws Error when either value ends up empty
+   */
+  constructor(apiKey?: string, apiUrl?: string) {
+    this.apiKey = apiKey || config.difyApiKey;
+    this.apiUrl = apiUrl || config.difyApiUrl;
+
+    if (!this.apiKey) {
+      throw new Error('Dify API Key is required');
+    }
+
+    if (!this.apiUrl) {
+      throw new Error('Dify API URL is required');
+    }
+
+    // Create the axios instance
+    this.client = axios.create({
+      baseURL: this.apiUrl,
+      headers: {
+        'Authorization': `Bearer ${this.apiKey}`,
+        'Content-Type': 'application/json',
+      },
+      timeout: 30000, // 30 second timeout
+    });
+
+    // Response interceptor: unified error handling.
+    // Errors that carry a response body are rethrown as DifyError; anything
+    // else (e.g. no response received) is rethrown unchanged.
+    this.client.interceptors.response.use(
+      (response) => response,
+      (error: AxiosError) => {
+        if (error.response?.data) {
+          const errorData = error.response.data as DifyErrorResponse;
+          throw new DifyError({
+            code: errorData.code || 'UNKNOWN_ERROR',
+            message: errorData.message || error.message,
+            status: error.response.status,
+          });
+        }
+        throw error;
+      }
+    );
+  }
+
+  // ==================== Dataset (knowledge base) management API ====================
+
+  /**
+   * Create a dataset.
+   *
+   * @param params Creation parameters
+   * @returns Information about the created dataset
+   */
+  async createDataset(params: CreateDatasetRequest): Promise<CreateDatasetResponse> {
+    const response = await this.client.post<CreateDatasetResponse>('/datasets', params);
+    return response.data;
+  }
+
+  /**
+   * List datasets.
+   *
+   * @param page Page number (starting at 1)
+   * @param limit Page size (default 20)
+   * @returns Dataset list
+   */
+  async getDatasets(page: number = 1, limit: number = 20): Promise<DatasetListResponse> {
+    const response = await this.client.get<DatasetListResponse>('/datasets', {
+      params: { page, limit },
+    });
+    return response.data;
+  }
+
+  /**
+   * Get dataset details.
+   *
+   * @param datasetId Dataset ID
+   * @returns Dataset information
+   */
+  async getDataset(datasetId: string): Promise<Dataset> {
+    const response = await this.client.get<Dataset>(`/datasets/${datasetId}`);
+    return response.data;
+  }
+
+  /**
+   * Delete a dataset.
+   *
+   * @param datasetId Dataset ID
+   */
+  async deleteDataset(datasetId: string): Promise<void> {
+    await this.client.delete(`/datasets/${datasetId}`);
+  }
+
+  // ==================== Document management API ====================
+
+  /**
+   * Upload a document directly into a dataset (simplified version).
+   *
+   * @param datasetId Dataset ID
+   * @param file File buffer
+   * @param filename File name
+   * @param params Creation parameters; top-level keys override the defaults below
+   * @returns Information about the created document
+   */
+  async uploadDocumentDirectly(
+    datasetId: string,
+    file: Buffer,
+    filename: string,
+    params?: Partial<CreateDocumentByFileRequest>
+  ): Promise<CreateDocumentResponse> {
+    const formData = new FormData();
+    formData.append('file', file, filename);
+
+    // Additional parameters. The spread is shallow: a caller-supplied
+    // process_rule replaces the whole default process_rule object.
+    const defaultParams = {
+      indexing_technique: 'high_quality',
+      process_rule: {
+        mode: 'automatic',
+        rules: {
+          pre_processing_rules: [
+            { id: 'remove_extra_spaces', enabled: true },
+            { id: 'remove_urls_emails', enabled: false },
+          ],
+          segmentation: {
+            separator: '\n',
+            max_tokens: 1500,  // Phase 1 tuning: raised from 500 to 1500 tokens
+          },
+        },
+      },
+      ...params,
+    };
+
+    formData.append('data', JSON.stringify(defaultParams));
+
+    // The multipart headers from form-data are passed per request so they take
+    // the place of the instance-level JSON Content-Type.
+    const response = await this.client.post<CreateDocumentResponse>(
+      `/datasets/${datasetId}/document/create_by_file`,
+      formData,
+      {
+        headers: {
+          ...formData.getHeaders(),
+          'Authorization': `Bearer ${this.apiKey}`,
+        },
+      }
+    );
+
+    return response.data;
+  }
+
+  /**
+   * List documents in a dataset.
+   *
+   * @param datasetId Dataset ID
+   * @param page Page number (starting at 1)
+   * @param limit Page size (default 20)
+   * @returns Document list
+   */
+  async getDocuments(
+    datasetId: string,
+    page: number = 1,
+    limit: number = 20
+  ): Promise<DocumentListResponse> {
+    const response = await this.client.get<DocumentListResponse>(
+      `/datasets/${datasetId}/documents`,
+      {
+        params: { page, limit },
+      }
+    );
+    return response.data;
+  }
+
+  /**
+   * Get document details.
+   *
+   * @param datasetId Dataset ID
+   * @param documentId Document ID
+   * @returns Document information
+   */
+  async getDocument(datasetId: string, documentId: string): Promise<Document> {
+    const response = await this.client.get<Document>(
+      `/datasets/${datasetId}/documents/${documentId}`
+    );
+    return response.data;
+  }
+
+  /**
+   * Delete a document.
+   *
+   * @param datasetId Dataset ID
+   * @param documentId Document ID
+   */
+  async deleteDocument(datasetId: string, documentId: string): Promise<void> {
+    await this.client.delete(`/datasets/${datasetId}/documents/${documentId}`);
+  }
+
+  /**
+   * Update a document (re-index).
+   *
+   * @param datasetId Dataset ID
+   * @param documentId Document ID
+   */
+  async updateDocument(datasetId: string, documentId: string): Promise<void> {
+    await this.client.post(`/datasets/${datasetId}/documents/${documentId}/processing`);
+  }
+
+  // ==================== Knowledge retrieval API ====================
+
+  /**
+   * Retrieve from a dataset.
+   *
+   * @param datasetId Dataset ID
+   * @param query Query text
+   * @param params Retrieval parameters; only `retrieval_model` is read, and its keys override the defaults below
+   * @returns Retrieval result
+   */
+  async retrieveKnowledge(
+    datasetId: string,
+    query: string,
+    params?: Partial<RetrievalRequest>
+  ): Promise<RetrievalResponse> {
+    const requestParams: RetrievalRequest = {
+      query,
+      retrieval_model: {
+        search_method: 'semantic_search',
+        reranking_enable: false,
+        top_k: 3,
+        score_threshold_enabled: false,
+        ...params?.retrieval_model,
+      },
+    };
+
+    const response = await this.client.post<RetrievalResponse>(
+      `/datasets/${datasetId}/retrieve`,
+      requestParams
+    );
+
+    return response.data;
+  }
+
+  // ==================== Helpers ====================
+
+  /**
+   * Poll a document's processing status.
+   *
+   * @param datasetId Dataset ID
+   * @param documentId Document ID
+   * @param maxAttempts Maximum number of attempts (default 30)
+   * @param interval Polling interval (milliseconds, default 2000 ms)
+   * @returns Document information once indexing_status is 'completed'
+   * @throws Error when indexing_status is 'error', or when all attempts are used up
+   */
+  async waitForDocumentProcessing(
+    datasetId: string,
+    documentId: string,
+    maxAttempts: number = 30,
+    interval: number = 2000
+  ): Promise<Document> {
+    for (let i = 0; i < maxAttempts; i++) {
+      const document = await this.getDocument(datasetId, documentId);
+
+      if (document.indexing_status === 'completed') {
+        return document;
+      }
+
+      if (document.indexing_status === 'error') {
+        throw new Error(`Document processing failed: ${document.error || 'Unknown error'}`);
+      }
+
+      // Wait, then try again
+      await new Promise((resolve) => setTimeout(resolve, interval));
+    }
+
+    throw new Error('Document processing timeout');
+  }
+
+  /**
+   * One-step upload into a dataset (upload + wait for processing to finish).
+   *
+   * @param datasetId Dataset ID
+   * @param file File buffer
+   * @param filename File name
+   * @returns Document information after processing completes
+   */
+  async uploadAndProcessDocument(
+    datasetId: string,
+    file: Buffer,
+    filename: string
+  ): Promise<Document> {
+    // 1. Upload the document directly
+    const createResult = await this.uploadDocumentDirectly(datasetId, file, filename);
+
+    // 2. Wait for processing to complete
+    const document = await this.waitForDocumentProcessing(
+      datasetId,
+      createResult.document.id
+    );
+
+    return document;
+  }
+}
+
+// Shared singleton instance. It is constructed at module load with the values
+// from config, so the constructor's checks run on import.
+export const difyClient = new DifyClient();
+
--- a/backend/src/common/rag/types.ts
+++ b/backend/src/common/rag/types.ts
@@ -0,0 +1,231 @@
+/**
+ * Dify API 类型定义
+ */
+
+// ==================== 知识库相关类型 ====================
+
+/**
+ * Knowledge base (dataset) information.
+ *
+ * NOTE(review): the snake_case fields presumably mirror the Dify API payload,
+ * and the numeric `created_at` / `updated_at` look like timestamps — confirm
+ * the unit against the API documentation.
+ */
+export interface Dataset {
+  id: string;
+  name: string;
+  description: string;
+  permission: 'only_me' | 'all_team_members';
+  data_source_type: 'upload_file' | 'notion_import' | 'website_crawl';
+  indexing_technique: 'high_quality' | 'economy';
+  app_count: number;
+  document_count: number;
+  word_count: number;
+  created_by: string;
+  created_at: number;
+  updated_by: string;
+  updated_at: number;
+}
+
+/**
+ * Request parameters for creating a knowledge base.
+ *
+ * Only `name` is required; every other field is optional.
+ */
+export interface CreateDatasetRequest {
+  name: string;
+  description?: string;
+  permission?: 'only_me' | 'all_team_members';
+  indexing_technique?: 'high_quality' | 'economy';
+  embedding_model?: string;
+  embedding_model_provider?: string;
+  // Retrieval settings; when the object is supplied, `search_method` is mandatory.
+  retrieval_model?: {
+    search_method: 'semantic_search' | 'full_text_search' | 'hybrid_search';
+    reranking_enable?: boolean;
+    reranking_model?: {
+      reranking_provider_name: string;
+      reranking_model_name: string;
+    };
+    top_k?: number;
+    score_threshold_enabled?: boolean;
+    score_threshold?: number;
+  };
+}
+
+/**
+ * Response returned after creating a knowledge base.
+ *
+ * Unlike `Dataset`, the `permission`, `data_source_type` and
+ * `indexing_technique` fields are typed as plain strings here.
+ */
+export interface CreateDatasetResponse {
+  id: string;
+  name: string;
+  description: string;
+  permission: string;
+  data_source_type: string;
+  indexing_technique: string;
+  created_by: string;
+  created_at: number;
+}
+
+/**
+ * Knowledge base list response.
+ *
+ * NOTE(review): `has_more`, `limit`, `total` and `page` look like pagination
+ * metadata — confirm their exact semantics against the API documentation.
+ */
+export interface DatasetListResponse {
+  // Knowledge bases contained in this response
+  data: Dataset[];
+  has_more: boolean;
+  limit: number;
+  total: number;
+  page: number;
+}
+
+// ==================== 文档相关类型 ====================
+
+/**
+ * Document information.
+ *
+ * NOTE(review): field names presumably mirror the Dify API payload; the
+ * numeric `created_at` / `disabled_at` look like timestamps — confirm the unit.
+ */
+export interface Document {
+  id: string;
+  position: number;
+  data_source_type: string;
+  data_source_info: {
+    upload_file_id: string;
+  };
+  dataset_process_rule_id: string;
+  name: string;
+  created_from: string;
+  created_by: string;
+  created_at: number;
+  tokens: number;
+  // Processing state of the document; one of the listed literal values
+  indexing_status: 'waiting' | 'parsing' | 'cleaning' | 'splitting' | 'indexing' | 'completed' | 'error' | 'paused';
+  // Optional error text; presumably populated when `indexing_status` is 'error' — confirm
+  error?: string;
+  enabled: boolean;
+  disabled_at?: number;
+  disabled_by?: string;
+  archived: boolean;
+  display_status: string;
+  word_count: number;
+  hit_count: number;
+}
+
+/**
+ * Document list response.
+ *
+ * NOTE(review): `has_more`, `limit`, `total` and `page` look like pagination
+ * metadata — confirm their exact semantics against the API documentation.
+ */
+export interface DocumentListResponse {
+  // Documents contained in this response
+  data: Document[];
+  has_more: boolean;
+  limit: number;
+  total: number;
+  page: number;
+}
+
+/**
+ * Response returned after uploading a file.
+ *
+ * NOTE(review): `size` is presumably in bytes and `created_at` a timestamp —
+ * confirm against the API documentation.
+ */
+export interface UploadFileResponse {
+  id: string;
+  name: string;
+  size: number;
+  extension: string;
+  mime_type: string;
+  created_by: string;
+  created_at: number;
+}
+
+/**
+ * Request parameters for creating a document from an uploaded file.
+ */
+export interface CreateDocumentByFileRequest {
+  indexing_technique: 'high_quality' | 'economy';
+  // Processing configuration: a rule set plus the mode it is applied in
+  process_rule: {
+    rules: {
+      // Pre-processing rules, each toggled individually via `enabled`
+      pre_processing_rules: Array<{
+        id: string;
+        enabled: boolean;
+      }>;
+      // Segmentation settings: separator string and per-segment token limit
+      segmentation: {
+        separator: string;
+        max_tokens: number;
+      };
+    };
+    mode: 'automatic' | 'custom';
+  };
+  original_document_id?: string;
+  doc_form?: 'text_model' | 'qa_model';
+  doc_language?: string;
+}
+
+/**
+ * Response returned after creating a document.
+ */
+export interface CreateDocumentResponse {
+  // The newly created document
+  document: Document;
+  // NOTE(review): presumably an identifier for the processing batch — confirm
+  batch: string;
+}
+
+// ==================== 知识库检索相关类型 ====================
+
+/**
+ * Request parameters for knowledge base retrieval.
+ *
+ * Only `query` is required; every `retrieval_model` field is optional.
+ */
+export interface RetrievalRequest {
+  query: string;
+  // Optional retrieval settings
+  retrieval_model?: {
+    search_method?: 'semantic_search' | 'full_text_search' | 'hybrid_search';
+    reranking_enable?: boolean;
+    reranking_model?: {
+      reranking_provider_name: string;
+      reranking_model_name: string;
+    };
+    top_k?: number;
+    score_threshold_enabled?: boolean;
+    score_threshold?: number;
+  };
+}
+
+/**
+ * A single retrieval result item.
+ *
+ * NOTE(review): `score` is presumably the relevance score of the segment and
+ * `content` its text — confirm against the API documentation.
+ */
+export interface RetrievalRecord {
+  segment_id: string;
+  document_id: string;
+  document_name: string;
+  position: number;
+  score: number;
+  content: string;
+  hit_count: number;
+  word_count: number;
+  segment_position: number;
+  index_node_hash: string;
+  // Free-form key/value metadata; the value types are not constrained
+  metadata: Record<string, any>;
+}
+
+/**
+ * Knowledge base retrieval response.
+ */
+export interface RetrievalResponse {
+  // The query, wrapped in an object with its text under `content`
+  query: {
+    content: string;
+  };
+  // Retrieved result items
+  records: RetrievalRecord[];
+}
+
+// ==================== 错误类型 ====================
+
+/**
+ * Dify API error response body.
+ *
+ * Used as the constructor argument of `DifyError`.
+ */
+export interface DifyErrorResponse {
+  code: string;
+  message: string;
+  // NOTE(review): presumably the HTTP status code — confirm
+  status: number;
+}
+
+/**
+ * Error thrown for Dify API failures.
+ *
+ * Wraps a `DifyErrorResponse`: its `message` becomes the Error message, while
+ * `code` and `status` are copied onto the instance.
+ */
+export class DifyError extends Error {
+  code: string;
+  status: number;
+
+  constructor(error: DifyErrorResponse) {
+    super(error.message);
+
+    const { code, status } = error;
+    this.name = 'DifyError';
+    this.code = code;
+    this.status = status;
+  }
+}
+
+
--- a/backend/src/common/utils/jsonParser.ts
+++ b/backend/src/common/utils/jsonParser.ts
@@ -0,0 +1,152 @@
+/**
+ * Phase 3: 批处理模式 - JSON解析工具
+ * 
+ * AI的输出可能包含额外的文字说明，需要提取JSON块并解析
+ */
+
+/**
+ * Outcome of parsing AI output as JSON.
+ */
+export interface ParseResult<T = any> {
+  // Whether parsing succeeded
+  success: boolean;
+  // Parsed value; set when parsing succeeded
+  data?: T;
+  // Failure reason; set when parsing failed
+  error?: string;
+  // The unmodified input string that was handed to the parser
+  rawOutput: string;
+}
+
+/**
+ * Extract the JSON portion of an AI response.
+ *
+ * Supported shapes:
+ * 1. Pure JSON:        { "key": "value" }
+ * 2. With a preamble:  some text\n{ "key": "value" }
+ * 3. With a suffix:    { "key": "value" }\n\nsome text
+ * 4. Fenced block:     ```json\n{ "key": "value" }\n```
+ *
+ * Strategy:
+ * 1. Scan left to right for a `{` or `[`, find its balanced closing bracket
+ *    (ignoring brackets inside string literals) and return the first slice
+ *    that `JSON.parse` accepts. This keeps braces in surrounding prose or in
+ *    string values from corrupting the result.
+ * 2. If nothing parses, fall back to the greedy "first opener to last closer"
+ *    match so callers still receive the same (unparseable) text as before.
+ * 3. Finally fall back to the trimmed content of a fenced code block.
+ *
+ * Note: step 1 rescans from every opener, so the worst case is quadratic in
+ * the input length for text containing many unmatched brackets.
+ *
+ * @param text Raw AI output
+ * @returns The extracted JSON text, or `null` when no candidate was found
+ */
+export function extractJSON(text: string): string | null {
+  // Attempt 1: first balanced {...} / [...] slice that is valid JSON
+  for (let start = 0; start < text.length; start++) {
+    const opener = text[start];
+    if (opener !== '{' && opener !== '[') {
+      continue;
+    }
+
+    const end = findBalancedEnd(text, start);
+    if (end === -1) {
+      continue;
+    }
+
+    const candidate = text.slice(start, end + 1);
+    if (isParsableJSON(candidate)) {
+      return candidate;
+    }
+  }
+
+  // Attempt 2 (legacy behaviour): greedy {...} or [...] span
+  const greedyMatch = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
+  if (greedyMatch) {
+    return greedyMatch[1];
+  }
+
+  // Attempt 3: content of a fenced code block
+  const codeMatch = text.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
+  if (codeMatch) {
+    return codeMatch[1].trim();
+  }
+
+  return null;
+}
+
+/**
+ * Find the index of the bracket that closes the opener at `start`.
+ * Brackets inside double-quoted strings (with backslash escapes) are ignored.
+ * Bracket kinds are not matched against each other; `JSON.parse` is the judge.
+ *
+ * @returns Index of the closing bracket, or -1 when the opener is never closed
+ */
+function findBalancedEnd(text: string, start: number): number {
+  let depth = 0;
+  let inString = false;
+  let escaped = false;
+
+  for (let i = start; i < text.length; i++) {
+    const ch = text[i];
+
+    if (inString) {
+      if (escaped) {
+        escaped = false;
+      } else if (ch === '\\') {
+        escaped = true;
+      } else if (ch === '"') {
+        inString = false;
+      }
+      continue;
+    }
+
+    if (ch === '"') {
+      inString = true;
+    } else if (ch === '{' || ch === '[') {
+      depth++;
+    } else if (ch === '}' || ch === ']') {
+      depth--;
+      if (depth === 0) {
+        return i;
+      }
+    }
+  }
+
+  return -1;
+}
+
+/** Report whether `candidate` is accepted by `JSON.parse`. */
+function isParsableJSON(candidate: string): boolean {
+  try {
+    JSON.parse(candidate);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Extract and parse JSON from a (possibly noisy) AI output string.
+ *
+ * When `expectedFields` is supplied and non-empty, the parsed value must be an
+ * object. Any expected field that is not an own property of it is reported via
+ * `console.warn` and filled in with the placeholder string '未提取到'.
+ * Inherited names such as `constructor` do not count as present.
+ *
+ * This function never throws: every failure is reported through the returned
+ * `ParseResult` with `success: false`.
+ *
+ * @param jsonString Raw text that contains JSON
+ * @param expectedFields Optional list of field names to verify / back-fill
+ * @returns Parse result carrying the data or the error, plus the raw input
+ */
+export function parseJSON<T = any>(
+  jsonString: string,
+  expectedFields?: string[]
+): ParseResult<T> {
+  const rawOutput = jsonString;
+
+  try {
+    // Locate the JSON block inside the raw text
+    const extracted = extractJSON(jsonString);
+
+    if (!extracted) {
+      return {
+        success: false,
+        error: '未找到JSON格式的数据',
+        rawOutput,
+      };
+    }
+
+    const parsed: unknown = JSON.parse(extracted);
+
+    // Field verification (only when expected fields were actually provided)
+    if (Array.isArray(expectedFields) && expectedFields.length > 0) {
+      // Primitives and null cannot carry fields; report that explicitly rather
+      // than leaking an engine-internal TypeError message.
+      if (typeof parsed !== 'object' || parsed === null) {
+        return {
+          success: false,
+          error: 'JSON数据不是对象，无法校验字段',
+          rawOutput,
+        };
+      }
+
+      const record = parsed as Record<string, unknown>;
+      const missingFields = expectedFields.filter(
+        (field) => !Object.prototype.hasOwnProperty.call(record, field)
+      );
+
+      if (missingFields.length > 0) {
+        console.warn(`[JsonParser] 缺少字段: ${missingFields.join(', ')}`);
+        // Back-fill missing fields with the placeholder value
+        for (const field of missingFields) {
+          record[field] = '未提取到';
+        }
+      }
+    }
+
+    return {
+      success: true,
+      data: parsed as T,
+      rawOutput,
+    };
+  } catch (error: unknown) {
+    return {
+      success: false,
+      error: error instanceof Error ? error.message : String(error),
+      rawOutput,
+    };
+  }
+}
+
+/**
+ * Check parsed data against a template's field definitions.
+ *
+ * Validation is deliberately lenient:
+ * - a missing or empty field (`undefined`, `null`, `''`) only logs a warning;
+ * - for fields declared as `number`, a non-number value is converted in place
+ *   with `Number(...)` when the conversion yields a real number.
+ *
+ * Empty values (`undefined`, `null`, blank strings) are never converted,
+ * because `Number(null)` and `Number('  ')` are both `0` and would turn
+ * "no value" into a fabricated zero.
+ *
+ * Note: `data` is mutated in place when a numeric conversion is applied.
+ *
+ * @param data Parsed data
+ * @param templateFields Template field definitions
+ * @returns `valid: false` with an error entry only when `data` is not an object
+ */
+export function validateTemplateData(
+  data: any,
+  templateFields: Array<{ key: string; type: string }>
+): { valid: boolean; errors: string[] } {
+  const errors: string[] = [];
+
+  if (!data || typeof data !== 'object') {
+    errors.push('数据不是有效的对象');
+    return { valid: false, errors };
+  }
+
+  for (const field of templateFields) {
+    const value = data[field.key];
+
+    // Presence check: a warning only, not an error
+    if (value === undefined || value === null || value === '') {
+      console.warn(`[JsonParser] 字段 ${field.key} 为空`);
+    }
+
+    // Lenient type check for numeric fields
+    if (field.type === 'number' && typeof value !== 'number') {
+      const isBlank =
+        value === undefined ||
+        value === null ||
+        (typeof value === 'string' && value.trim() === '');
+
+      // Leave empty values untouched instead of coercing them to 0
+      if (isBlank) {
+        continue;
+      }
+
+      const num = Number(value);
+      if (!Number.isNaN(num)) {
+        data[field.key] = num;
+      }
+    }
+  }
+
+  return { valid: errors.length === 0, errors };
+}
+
+
+
+
+
+
+
+
+