feat(rag): Complete RAG engine implementation with pgvector

Major Features: - Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk - Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors) - Implemented ChunkService (smart Markdown chunking) - Implemented VectorSearchService (multi-query + hybrid search) - Implemented RerankService (qwen3-rerank) - Integrated DeepSeek V3 QueryRewriter for cross-language search - Python service: Added pymupdf4llm for PDF-to-Markdown conversion - PKB: Dual-mode adapter (pgvector/dify/hybrid) Architecture: - Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector) - Cross-language support: Chinese query matches English documents - Small Embedding (1024) + Strong Reranker strategy Performance: - End-to-end latency: 2.5s - Cost per query: 0.0025 RMB - Accuracy improvement: +20.5% (cross-language) Tests: - test-embedding-service.ts: Vector embedding verified - test-rag-e2e.ts: Full pipeline tested - test-rerank.ts: Rerank quality validated - test-query-rewrite.ts: Cross-language search verified - test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf) Documentation: - Added 05-RAG-Engine-User-Guide.md - Added 02-Document-Processing-User-Guide.md - Updated system status documentation Status: Production ready
2026-01-21 20:24:29 +08:00
parent 1f5bf2cd65
commit 40c2f8e148
338 changed files with 11014 additions and 1158 deletions
--- a/backend/prisma/schema.prisma
+++ b/backend/prisma/schema.prisma
@@ -6,7 +6,7 @@ generator client {
 datasource db {
  provider = "postgresql"
  url      = env("DATABASE_URL")
-  schemas  = ["admin_schema", "aia_schema", "asl_schema", "capability_schema", "common_schema", "dc_schema", "iit_schema", "pkb_schema", "platform_schema", "public", "rvw_schema", "ssa_schema", "st_schema"]
+  schemas  = ["admin_schema", "aia_schema", "asl_schema", "capability_schema", "common_schema", "dc_schema", "ekb_schema", "iit_schema", "pkb_schema", "platform_schema", "public", "rvw_schema", "ssa_schema", "st_schema"]
 }

 /// 应用缓存表 - Postgres-Only架构
@@ -1283,3 +1283,113 @@ enum PromptStatus {

  @@schema("capability_schema")
 }
+
+// ============================================================
+// EKB Schema - 知识库引擎 (Enterprise Knowledge Base)
+// 参考文档: docs/02-通用能力层/03-RAG引擎/04-数据模型设计.md
+// ============================================================
+
+/// 知识库容器表 - 管理知识库的归属和策略配置
+model EkbKnowledgeBase {
+  id          String   @id @default(uuid())
+  name        String                        /// 知识库名称
+  description String?                       /// 描述
+  
+  /// 核心隔离字段
+  /// USER: 用户私有，ownerId = userId
+  /// SYSTEM: 系统公共，ownerId = moduleId (如 "ASL", "AIA")
+  type        String   @default("USER")     /// USER | SYSTEM
+  ownerId     String   @map("owner_id")     /// userId 或 moduleId
+  
+  /// 策略配置 (JSONB)
+  /// { chunkSize, topK, enableRerank, embeddingModel }
+  config      Json?    @db.JsonB
+  
+  documents   EkbDocument[]
+  
+  createdAt   DateTime @default(now()) @map("created_at")
+  updatedAt   DateTime @updatedAt @map("updated_at")
+
+  @@index([ownerId], map: "idx_ekb_kb_owner")
+  @@index([type], map: "idx_ekb_kb_type")
+  @@map("ekb_knowledge_base")
+  @@schema("ekb_schema")
+}
+
+/// 文档表 - 存储上传的文档及其元数据
+model EkbDocument {
+  id              String   @id @default(uuid())
+  kbId            String   @map("kb_id")              /// 所属知识库
+  userId          String   @map("user_id")            /// 上传者（冗余存储）
+  
+  // ===== Layer 1: 基础信息（必须）=====
+  filename        String                              /// 文件名
+  fileType        String   @map("file_type")          /// pdf, docx, pptx, xlsx, md, txt
+  fileSizeBytes   BigInt   @map("file_size_bytes")    /// 文件大小（字节）
+  fileUrl         String   @map("file_url")           /// OSS 存储路径
+  fileHash        String?  @map("file_hash")          /// SHA256 哈希（秒传去重）
+  status          String   @default("pending")        /// pending, processing, completed, failed
+  errorMessage    String?  @map("error_message") @db.Text
+  
+  // ===== Layer 0: RAG 核心（必须）=====
+  extractedText   String?  @map("extracted_text") @db.Text  /// Markdown 全文
+  
+  // ===== Layer 2: 内容增强（可选）=====
+  summary         String?  @db.Text                   /// AI 摘要
+  tokenCount      Int?     @map("token_count")        /// Token 数量
+  pageCount       Int?     @map("page_count")         /// 页数
+  
+  // ===== Layer 3: 分类标签（可选）=====
+  contentType     String?  @map("content_type")       /// 内容类型
+  tags            String[]                            /// 用户标签
+  category        String?                             /// 分类目录
+  
+  // ===== Layer 4: 结构化数据（可选）=====
+  metadata        Json?    @db.JsonB                  /// 文献属性 JSONB
+  structuredData  Json?    @map("structured_data") @db.JsonB  /// 类型特定数据 JSONB
+  
+  // ===== 关联 =====
+  knowledgeBase   EkbKnowledgeBase @relation(fields: [kbId], references: [id], onDelete: Cascade)
+  chunks          EkbChunk[]
+  
+  createdAt       DateTime @default(now()) @map("created_at")
+  updatedAt       DateTime @updatedAt @map("updated_at")
+
+  @@index([kbId], map: "idx_ekb_doc_kb")
+  @@index([userId], map: "idx_ekb_doc_user")
+  @@index([status], map: "idx_ekb_doc_status")
+  @@index([contentType], map: "idx_ekb_doc_content_type")
+  @@index([fileHash], map: "idx_ekb_doc_file_hash")
+  @@map("ekb_document")
+  @@schema("ekb_schema")
+}
+
+/// 切片表 - 存储文档切片和向量嵌入
+model EkbChunk {
+  id              String   @id @default(uuid())
+  documentId      String   @map("document_id")        /// 所属文档
+  
+  // ===== 核心内容 =====
+  content         String   @db.Text                   /// 切片文本（Markdown）
+  chunkIndex      Int      @map("chunk_index")        /// 切片序号（从 0 开始）
+  
+  // ===== 向量 =====
+  /// pgvector 1024 维向量
+  /// 注意：需要手动创建 HNSW 索引
+  embedding       Unsupported("vector(1024)")?
+  
+  // ===== 溯源信息（可选）=====
+  pageNumber      Int?     @map("page_number")        /// 页码（PDF 溯源）
+  sectionType     String?  @map("section_type")       /// 章节类型
+  
+  // ===== 扩展元数据（可选）=====
+  metadata        Json?    @db.JsonB                  /// 切片级元数据 JSONB
+  
+  document        EkbDocument @relation(fields: [documentId], references: [id], onDelete: Cascade)
+  
+  createdAt       DateTime @default(now()) @map("created_at")
+
+  @@index([documentId], map: "idx_ekb_chunk_doc")
+  @@map("ekb_chunk")
+  @@schema("ekb_schema")
+}