Files
AIclinicalresearch/backend/compare_db.ts
HaHafeng 40c2f8e148 feat(rag): Complete RAG engine implementation with pgvector
Major Features:
- Created ekb_schema (13th schema) with 3 tables: KB/Document/Chunk
- Implemented EmbeddingService (text-embedding-v4, 1024-dim vectors)
- Implemented ChunkService (smart Markdown chunking)
- Implemented VectorSearchService (multi-query + hybrid search)
- Implemented RerankService (qwen3-rerank)
- Integrated DeepSeek V3 QueryRewriter for cross-language search
- Python service: Added pymupdf4llm for PDF-to-Markdown conversion
- PKB: Dual-mode adapter (pgvector/dify/hybrid)

Architecture:
- Brain-Hand Model: Business layer (DeepSeek) + Engine layer (pgvector)
- Cross-language support: Chinese query matches English documents
- Small Embedding (1024) + Strong Reranker strategy

Performance:
- End-to-end latency: 2.5s
- Cost per query: 0.0025 RMB
- Accuracy improvement: +20.5% (cross-language)

Tests:
- test-embedding-service.ts: Vector embedding verified
- test-rag-e2e.ts: Full pipeline tested
- test-rerank.ts: Rerank quality validated
- test-query-rewrite.ts: Cross-language search verified
- test-pdf-ingest.ts: Real PDF document tested (Dongen 2003.pdf)

Documentation:
- Added 05-RAG-Engine-User-Guide.md
- Added 02-Document-Processing-User-Guide.md
- Updated system status documentation

Status: Production ready
2026-01-21 20:24:29 +08:00

125 lines
3.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { PrismaClient } from '@prisma/client';
// Single Prisma client shared by the whole script: main() issues its raw
// queries through it, and the trailing .finally() handler disconnects it.
const prisma = new PrismaClient();
/**
 * Prints a diff report between the live database and two hard-coded
 * expectations (described in the comments below as coming from the
 * 2025-12-24 backup):
 *
 *  1. for every `schema.table` in `backupTables`, whether that table is
 *     currently listed in `information_schema.tables`;
 *  2. the current column layout of `platform_schema.users`, followed by the
 *     columns that are missing from, or new relative to, `originalUserColumns`.
 *
 * All output goes to stdout via `console.log`; nothing is returned.
 *
 * A failure of the table lookup is reported per table and does not abort the
 * report. A failure of the column query is not caught here, so it rejects the
 * returned promise.
 */
async function main(): Promise<void> {
  interface TableRow {
    table_schema: string;
    table_name: string;
  }
  interface ColumnRow {
    column_name: string;
    data_type: string;
    is_nullable: string;
    column_default: string | null;
  }

  console.log('🔍 数据库差异分析\n');
  console.log('='.repeat(60));

  // Tables that should exist according to the backup file (2025-12-24).
  const backupTables = [
    // aia_schema
    'aia_schema.conversations',
    'aia_schema.general_conversations',
    'aia_schema.general_messages',
    'aia_schema.messages',
    'aia_schema.projects',
    // asl_schema
    'asl_schema.fulltext_screening_results',
    'asl_schema.fulltext_screening_tasks',
    'asl_schema.literatures',
    'asl_schema.screening_projects',
    'asl_schema.screening_results',
    'asl_schema.screening_tasks',
    // dc_schema
    'dc_schema.dc_extraction_items',
    'dc_schema.dc_extraction_tasks',
    'dc_schema.dc_health_checks',
    'dc_schema.dc_templates',
    'dc_schema.dc_tool_c_ai_history',
    'dc_schema.dc_tool_c_sessions',
    // pkb_schema
    'pkb_schema.batch_results',
    'pkb_schema.batch_tasks',
    'pkb_schema.documents',
    'pkb_schema.knowledge_bases',
    'pkb_schema.task_templates',
    // platform_schema
    'platform_schema.app_cache',
    'platform_schema.job',
    'platform_schema.job_common', // possibly missing
    'platform_schema.queue',
    'platform_schema.schedule',
    'platform_schema.subscription',
    'platform_schema.users',
    'platform_schema.version',
    // public
    'public._prisma_migrations',
    'public.admin_logs',
    'public.review_tasks', // may have been moved to rvw_schema
    'public.users',
  ];

  console.log('\n📋 检查备份中的表是否在当前数据库中存在:\n');

  // Fetch the catalog once and test membership in memory. This avoids one
  // round trip per table, avoids building SQL by string interpolation, and
  // does not depend on whether the driver returns COUNT(*) as bigint or number.
  let existingTables: Set<string> | undefined;
  let lookupFailure = '';
  try {
    const tableRows = await prisma.$queryRaw<TableRow[]>`
      SELECT table_schema, table_name
      FROM information_schema.tables
    `;
    existingTables = new Set(
      tableRows.map(row => `${row.table_schema}.${row.table_name}`),
    );
  } catch (e: unknown) {
    lookupFailure = e instanceof Error ? e.message : String(e);
  }

  for (const table of backupTables) {
    if (existingTables === undefined) {
      // Same per-table failure line the report has always used.
      console.log(`${table} - 查询失败: ${lookupFailure}`);
    } else if (existingTables.has(table)) {
      console.log(`${table} - 存在`);
    } else {
      console.log(`${table} - 不存在!`);
    }
  }

  // Inspect the column layout of platform_schema.users.
  console.log('\n\n📋 platform_schema.users 当前列结构:\n');
  const cols = await prisma.$queryRaw<ColumnRow[]>`
    SELECT column_name, data_type, is_nullable, column_default
    FROM information_schema.columns
    WHERE table_schema = 'platform_schema' AND table_name = 'users'
    ORDER BY ordinal_position;
  `;
  for (const c of cols) {
    console.log(` ${c.column_name}: ${c.data_type} ${c.is_nullable === 'NO' ? 'NOT NULL' : 'NULLABLE'} ${c.column_default ? `DEFAULT ${c.column_default}` : ''}`);
  }

  // Columns platform_schema.users is expected to have according to the backup.
  const originalUserColumns = ['id', 'email', 'password', 'name', 'avatar_url', 'role', 'status', 'kb_quota', 'kb_used', 'trial_ends_at', 'is_trial', 'last_login_at', 'created_at', 'updated_at'];
  const currentColNames = cols.map(c => c.column_name);

  console.log('\n📋 对比 platform_schema.users 与备份:');
  console.log(' 原始列(备份): ' + originalUserColumns.join(', '));
  console.log(' 当前列: ' + currentColNames.join(', '));

  // Set lookups keep both diffs linear; result order follows the source lists.
  const currentSet = new Set(currentColNames);
  const originalSet = new Set(originalUserColumns);
  const missingInCurrent = originalUserColumns.filter(name => !currentSet.has(name));
  const newInCurrent = currentColNames.filter(name => !originalSet.has(name));

  if (missingInCurrent.length > 0) {
    console.log('\n ⚠️ 备份中有但当前缺失的列: ' + missingInCurrent.join(', '));
  }
  if (newInCurrent.length > 0) {
    console.log(' 当前新增的列: ' + newInCurrent.join(', '));
  }
  console.log('\n' + '='.repeat(60));
}
// Script entry point: run the report, log any failure, and always release the
// database connection. A failure also sets a non-zero exit code so shell/CI
// callers can detect it; exitCode (rather than process.exit) lets the
// disconnect in .finally() still run.
main()
  .catch((error: unknown) => {
    console.error(error);
    process.exitCode = 1;
  })
  .finally(() => prisma.$disconnect());