Files
HaHafeng 88cc049fb3 feat(asl): Complete Day 5 - Fulltext Screening Backend API Development
- Implement 5 core API endpoints (create task, get progress, get results, update decision, export Excel)
- Add FulltextScreeningController with Zod validation (652 lines)
- Implement ExcelExporter service with 4-sheet report generation (352 lines)
- Register routes under /api/v1/asl/fulltext-screening
- Create 31 REST Client test cases
- Add automated integration test script
- Fix PDF extraction fallback mechanism in LLM12FieldsService
- Update API design documentation to v3.0
- Update development plan to v1.2
- Create Day 5 development record
- Clean up temporary test files
2025-11-23 10:52:07 +08:00

425 lines
14 KiB
PL/PgSQL
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-- ========================================
-- 004-migrate-pkb.sql
-- ========================================
-- 目的迁移pkb_schema个人知识库模块
-- 迁移表5个knowledge_bases, documents, batch_tasks, batch_results, task_templates
-- 预计时间30分钟
-- 作者AI助手
-- 日期2025-11-09
-- ========================================
-- 前置条件:
-- 1. 已执行 001-create-all-10-schemas.sql
-- 2. 已执行 002-migrate-platform.sql因为需要引用platform_schema.users
-- 3. public schema中的相关表存在且有数据
BEGIN;
-- ========================================
-- 第一步创建pkb_schema.knowledge_bases表
-- ========================================
CREATE TABLE IF NOT EXISTS pkb_schema.knowledge_bases (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL,
name VARCHAR(255) NOT NULL,
description TEXT,
dify_dataset_id VARCHAR(255) NOT NULL,
file_count INT DEFAULT 0,
total_size_bytes BIGINT DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES platform_schema.users(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_pkb_knowledge_bases_user_id ON pkb_schema.knowledge_bases(user_id);
CREATE INDEX IF NOT EXISTS idx_pkb_knowledge_bases_dify_dataset_id ON pkb_schema.knowledge_bases(dify_dataset_id);
-- ========================================
-- 第二步创建pkb_schema.documents表包含Phase 2字段
-- ========================================
CREATE TABLE IF NOT EXISTS pkb_schema.documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
kb_id UUID NOT NULL,
user_id UUID NOT NULL,
filename VARCHAR(255) NOT NULL,
file_type VARCHAR(50) NOT NULL,
file_size_bytes BIGINT NOT NULL,
file_url TEXT NOT NULL,
dify_document_id VARCHAR(255) NOT NULL,
status VARCHAR(50) DEFAULT 'uploading',
progress INT DEFAULT 0,
error_message TEXT,
segments_count INT,
tokens_count INT,
-- Phase 2全文阅读模式字段
extraction_method VARCHAR(50),
extraction_quality FLOAT,
char_count INT,
language VARCHAR(20),
extracted_text TEXT,
uploaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP,
FOREIGN KEY (kb_id) REFERENCES pkb_schema.knowledge_bases(id) ON DELETE CASCADE,
FOREIGN KEY (user_id) REFERENCES platform_schema.users(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_pkb_documents_kb_id ON pkb_schema.documents(kb_id);
CREATE INDEX IF NOT EXISTS idx_pkb_documents_user_id ON pkb_schema.documents(user_id);
CREATE INDEX IF NOT EXISTS idx_pkb_documents_status ON pkb_schema.documents(status);
CREATE INDEX IF NOT EXISTS idx_pkb_documents_dify_document_id ON pkb_schema.documents(dify_document_id);
CREATE INDEX IF NOT EXISTS idx_pkb_documents_extraction_method ON pkb_schema.documents(extraction_method);
-- ========================================
-- 第三步创建pkb_schema.batch_tasks表
-- ========================================
CREATE TABLE IF NOT EXISTS pkb_schema.batch_tasks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL,
kb_id UUID NOT NULL,
name VARCHAR(255) NOT NULL,
template_type VARCHAR(50) NOT NULL,
template_id VARCHAR(100),
prompt TEXT NOT NULL,
status VARCHAR(50) NOT NULL,
total_documents INT NOT NULL,
completed_count INT DEFAULT 0,
failed_count INT DEFAULT 0,
model_type VARCHAR(50) NOT NULL,
concurrency INT DEFAULT 3,
started_at TIMESTAMP,
completed_at TIMESTAMP,
duration_seconds INT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES platform_schema.users(id) ON DELETE CASCADE,
FOREIGN KEY (kb_id) REFERENCES pkb_schema.knowledge_bases(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_tasks_user_id ON pkb_schema.batch_tasks(user_id);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_tasks_kb_id ON pkb_schema.batch_tasks(kb_id);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_tasks_status ON pkb_schema.batch_tasks(status);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_tasks_created_at ON pkb_schema.batch_tasks(created_at);
-- ========================================
-- 第四步创建pkb_schema.batch_results表
-- ========================================
CREATE TABLE IF NOT EXISTS pkb_schema.batch_results (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
task_id UUID NOT NULL,
document_id UUID NOT NULL,
status VARCHAR(50) NOT NULL,
data JSONB,
raw_output TEXT,
error_message TEXT,
processing_time_ms INT,
tokens_used INT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (task_id) REFERENCES pkb_schema.batch_tasks(id) ON DELETE CASCADE,
FOREIGN KEY (document_id) REFERENCES pkb_schema.documents(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_results_task_id ON pkb_schema.batch_results(task_id);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_results_document_id ON pkb_schema.batch_results(document_id);
CREATE INDEX IF NOT EXISTS idx_pkb_batch_results_status ON pkb_schema.batch_results(status);
-- ========================================
-- 第五步创建pkb_schema.task_templates表
-- ========================================
CREATE TABLE IF NOT EXISTS pkb_schema.task_templates (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL,
name VARCHAR(255) NOT NULL,
description TEXT,
prompt TEXT NOT NULL,
output_fields JSONB NOT NULL,
is_public BOOLEAN DEFAULT false,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (user_id) REFERENCES platform_schema.users(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_pkb_task_templates_user_id ON pkb_schema.task_templates(user_id);
-- ========================================
-- 第六步:迁移数据
-- ========================================
-- 6.1 迁移knowledge_bases
INSERT INTO pkb_schema.knowledge_bases (
id, user_id, name, description, dify_dataset_id,
file_count, total_size_bytes, created_at, updated_at
)
SELECT
id, user_id, name, description, dify_dataset_id,
file_count, total_size_bytes, created_at, updated_at
FROM public.knowledge_bases
ON CONFLICT (id) DO NOTHING;
-- 6.2 迁移documents包含Phase 2字段
INSERT INTO pkb_schema.documents (
id, kb_id, user_id, filename, file_type,
file_size_bytes, file_url, dify_document_id,
status, progress, error_message, segments_count, tokens_count,
extraction_method, extraction_quality, char_count,
language, extracted_text, uploaded_at, processed_at
)
SELECT
id, kb_id, user_id, filename, file_type,
file_size_bytes, file_url, dify_document_id,
status, progress, error_message, segments_count, tokens_count,
extraction_method, extraction_quality, char_count,
language, extracted_text, uploaded_at, processed_at
FROM public.documents
ON CONFLICT (id) DO NOTHING;
-- 6.3 迁移batch_tasks
INSERT INTO pkb_schema.batch_tasks (
id, user_id, kb_id, name, template_type, template_id,
prompt, status, total_documents, completed_count, failed_count,
model_type, concurrency, started_at, completed_at, duration_seconds,
created_at, updated_at
)
SELECT
id, user_id, kb_id, name, template_type, template_id,
prompt, status, total_documents, completed_count, failed_count,
model_type, concurrency, started_at, completed_at, duration_seconds,
created_at, updated_at
FROM public.batch_tasks
ON CONFLICT (id) DO NOTHING;
-- 6.4 迁移batch_results
INSERT INTO pkb_schema.batch_results (
id, task_id, document_id, status, data, raw_output,
error_message, processing_time_ms, tokens_used, created_at
)
SELECT
id, task_id, document_id, status, data, raw_output,
error_message, processing_time_ms, tokens_used, created_at
FROM public.batch_results
ON CONFLICT (id) DO NOTHING;
-- 6.5 迁移task_templates
INSERT INTO pkb_schema.task_templates (
id, user_id, name, description, prompt,
output_fields, is_public, created_at, updated_at
)
SELECT
id, user_id, name, description, prompt,
output_fields, is_public, created_at, updated_at
FROM public.task_templates
ON CONFLICT (id) DO NOTHING;
-- ========================================
-- 第七步:数据验证
-- ========================================
DO $$
DECLARE
public_knowledge_bases INTEGER;
public_documents INTEGER;
public_batch_tasks INTEGER;
public_batch_results INTEGER;
public_task_templates INTEGER;
pkb_knowledge_bases INTEGER;
pkb_documents INTEGER;
pkb_batch_tasks INTEGER;
pkb_batch_results INTEGER;
pkb_task_templates INTEGER;
all_match BOOLEAN := true;
BEGIN
-- 统计原表
SELECT COUNT(*) INTO public_knowledge_bases FROM public.knowledge_bases;
SELECT COUNT(*) INTO public_documents FROM public.documents;
SELECT COUNT(*) INTO public_batch_tasks FROM public.batch_tasks;
SELECT COUNT(*) INTO public_batch_results FROM public.batch_results;
SELECT COUNT(*) INTO public_task_templates FROM public.task_templates;
-- 统计新表
SELECT COUNT(*) INTO pkb_knowledge_bases FROM pkb_schema.knowledge_bases;
SELECT COUNT(*) INTO pkb_documents FROM pkb_schema.documents;
SELECT COUNT(*) INTO pkb_batch_tasks FROM pkb_schema.batch_tasks;
SELECT COUNT(*) INTO pkb_batch_results FROM pkb_schema.batch_results;
SELECT COUNT(*) INTO pkb_task_templates FROM pkb_schema.task_templates;
-- 输出统计
RAISE NOTICE '==================== 数据迁移统计 ====================';
RAISE NOTICE 'knowledge_bases: public.% -> pkb_schema.%', public_knowledge_bases, pkb_knowledge_bases;
RAISE NOTICE 'documents: public.% -> pkb_schema.%', public_documents, pkb_documents;
RAISE NOTICE 'batch_tasks: public.% -> pkb_schema.%', public_batch_tasks, pkb_batch_tasks;
RAISE NOTICE 'batch_results: public.% -> pkb_schema.%', public_batch_results, pkb_batch_results;
RAISE NOTICE 'task_templates: public.% -> pkb_schema.%', public_task_templates, pkb_task_templates;
RAISE NOTICE '=====================================================';
-- 验证每个表
IF public_knowledge_bases != pkb_knowledge_bases THEN
RAISE WARNING '⚠️ knowledge_bases 数据量不一致';
all_match := false;
END IF;
IF public_documents != pkb_documents THEN
RAISE WARNING '⚠️ documents 数据量不一致';
all_match := false;
END IF;
IF public_batch_tasks != pkb_batch_tasks THEN
RAISE WARNING '⚠️ batch_tasks 数据量不一致';
all_match := false;
END IF;
IF public_batch_results != pkb_batch_results THEN
RAISE WARNING '⚠️ batch_results 数据量不一致';
all_match := false;
END IF;
IF public_task_templates != pkb_task_templates THEN
RAISE WARNING '⚠️ task_templates 数据量不一致';
all_match := false;
END IF;
IF all_match THEN
RAISE NOTICE '✅ PKB Schema 所有表数据迁移成功!';
ELSE
RAISE WARNING '⚠️ 部分表数据迁移存在问题,请检查';
END IF;
END $$;
-- ========================================
-- 第八步:外键完整性验证
-- ========================================
-- 验证knowledge_bases的user_id
DO $$
DECLARE
invalid_kb_users INTEGER;
invalid_doc_users INTEGER;
invalid_doc_kb INTEGER;
BEGIN
-- 验证knowledge_bases.user_id
SELECT COUNT(*) INTO invalid_kb_users
FROM pkb_schema.knowledge_bases kb
LEFT JOIN platform_schema.users u ON kb.user_id = u.id
WHERE u.id IS NULL;
IF invalid_kb_users > 0 THEN
RAISE WARNING '⚠️ knowledge_bases表中有 % 条记录的user_id无效', invalid_kb_users;
ELSE
RAISE NOTICE '✅ knowledge_bases表user_id外键完整性验证通过';
END IF;
-- 验证documents.user_id
SELECT COUNT(*) INTO invalid_doc_users
FROM pkb_schema.documents d
LEFT JOIN platform_schema.users u ON d.user_id = u.id
WHERE u.id IS NULL;
IF invalid_doc_users > 0 THEN
RAISE WARNING '⚠️ documents表中有 % 条记录的user_id无效', invalid_doc_users;
ELSE
RAISE NOTICE '✅ documents表user_id外键完整性验证通过';
END IF;
-- 验证documents.kb_id
SELECT COUNT(*) INTO invalid_doc_kb
FROM pkb_schema.documents d
LEFT JOIN pkb_schema.knowledge_bases kb ON d.kb_id = kb.id
WHERE kb.id IS NULL;
IF invalid_doc_kb > 0 THEN
RAISE WARNING '⚠️ documents表中有 % 条记录的kb_id无效', invalid_doc_kb;
ELSE
RAISE NOTICE '✅ documents表kb_id外键完整性验证通过';
END IF;
END $$;
-- ========================================
-- 第九步Phase 2字段统计
-- ========================================
SELECT
'全文阅读字段统计' AS category,
COUNT(*) AS total_documents,
COUNT(extracted_text) AS has_extracted_text,
COUNT(extraction_method) AS has_extraction_method,
COUNT(CASE WHEN extraction_method = 'pymupdf' THEN 1 END) AS pymupdf_count,
COUNT(CASE WHEN extraction_method = 'nougat' THEN 1 END) AS nougat_count,
AVG(extraction_quality) AS avg_quality,
AVG(char_count) AS avg_char_count
FROM pkb_schema.documents;
COMMIT;
-- ========================================
-- 执行结果统计(可单独运行)
-- ========================================
SELECT
'pkb_schema' AS schema_name,
'knowledge_bases' AS table_name,
COUNT(*) AS row_count,
COUNT(DISTINCT user_id) AS unique_users,
SUM(file_count) AS total_files,
pg_size_pretty(SUM(total_size_bytes)) AS total_size
FROM pkb_schema.knowledge_bases
UNION ALL
SELECT
'pkb_schema',
'documents',
COUNT(*),
COUNT(DISTINCT user_id),
NULL,
pg_size_pretty(SUM(file_size_bytes))
FROM pkb_schema.documents
UNION ALL
SELECT
'pkb_schema',
'batch_tasks',
COUNT(*),
COUNT(DISTINCT user_id),
NULL,
NULL
FROM pkb_schema.batch_tasks;
-- ========================================
-- 完成提示
-- ========================================
-- ✅ PKB Schema 迁移完成
-- 包含5个表knowledge_bases, documents, batch_tasks,
-- batch_results, task_templates
-- 下一步:执行 005-validate-all.sql 进行全局验证
-- ========================================