From d96cdf3fe87454d566e1129decd937882453dbc6 Mon Sep 17 00:00:00 2001 From: HaHafeng Date: Tue, 10 Mar 2026 13:15:36 +0800 Subject: [PATCH] fix(aia): stabilize attachment parsing and expand spreadsheet support Align AIA attachment handling with actual extractor capability by adding xlsx/csv gray support, guiding doc/xls users to convert formats, and enforcing attachment-only answering to prevent system knowledge leakage. Made-with: Cursor --- .../src/common/document/ExtractionClient.ts | 48 +++++++++++++++++++ .../aia/controllers/attachmentController.ts | 9 +++- .../modules/aia/services/attachmentService.ts | 39 +++++++++++---- .../aia/services/conversationService.ts | 10 ++++ backend/src/modules/aia/types/index.ts | 4 +- docs/05-部署文档/03-待部署变更清单.md | 4 +- .../modules/aia/components/ChatWorkspace.tsx | 22 ++++++--- frontend-v2/src/modules/aia/constants.ts | 1 + 8 files changed, 118 insertions(+), 19 deletions(-) diff --git a/backend/src/common/document/ExtractionClient.ts b/backend/src/common/document/ExtractionClient.ts index 67302dae..73c6708d 100644 --- a/backend/src/common/document/ExtractionClient.ts +++ b/backend/src/common/document/ExtractionClient.ts @@ -31,6 +31,19 @@ export interface ExtractionResult { error?: string; } +export interface MarkdownExtractionResult { + success: boolean; + text?: string; + format?: string; + metadata?: { + original_file_type?: string; + char_count?: number; + filename?: string; + [key: string]: unknown; + }; + error?: string; +} + /** * 数据侦探结果(Python 返回) */ @@ -95,6 +108,7 @@ export interface IExtractionClient { extractTxt(file: Buffer, filename: string): Promise; detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>; getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>; + convertToMarkdown(file: Buffer, filename: string): Promise; 
analyzeDocx(filePath: string, config: ForensicsConfig): Promise; } @@ -332,6 +346,40 @@ class ExtractionClient implements IExtractionClient { } } + /** + * 通用文档转 Markdown(用于 .xlsx 等灰度支持格式) + */ + async convertToMarkdown( + file: Buffer, + filename: string + ): Promise { + try { + const formData = new FormData(); + formData.append('file', file, filename); + + const response = await axios.post( + `${this.baseUrl}/api/document/to-markdown`, + formData, + { + headers: { + ...formData.getHeaders(), + }, + timeout: 180000, // 3 分钟超时(Excel 大文件转换) + } + ); + + return response.data; + } catch (error) { + console.error('[ExtractionClient] Convert to markdown failed:', error); + + if (axios.isAxiosError(error) && error.response) { + throw new Error(`Document markdown conversion failed: ${error.response.data.detail || error.message}`); + } + + throw new Error('Document markdown conversion failed'); + } + } + /** * 🆕 数据侦探 API - 分析 Word 文档 * 提取表格并进行数据验证(L1 算术 + L2 统计 + L2.5 一致性) diff --git a/backend/src/modules/aia/controllers/attachmentController.ts b/backend/src/modules/aia/controllers/attachmentController.ts index fe572327..3ddcfbb9 100644 --- a/backend/src/modules/aia/controllers/attachmentController.ts +++ b/backend/src/modules/aia/controllers/attachmentController.ts @@ -79,10 +79,15 @@ export async function uploadAttachment( stack: error instanceof Error ? error.stack : undefined, }); - return reply.status(500).send({ + const isValidationError = + errorMessage.includes('不支持') || + errorMessage.includes('文件类型') || + errorMessage.includes('仅支持'); + + return reply.status(isValidationError ? 400 : 500).send({ code: -1, error: { - code: 'INTERNAL_ERROR', + code: isValidationError ? 
'VALIDATION_ERROR' : 'INTERNAL_ERROR', message: errorMessage, }, }); diff --git a/backend/src/modules/aia/services/attachmentService.ts b/backend/src/modules/aia/services/attachmentService.ts index 6a3855f7..38a46b3a 100644 --- a/backend/src/modules/aia/services/attachmentService.ts +++ b/backend/src/modules/aia/services/attachmentService.ts @@ -24,7 +24,7 @@ const ATTACHMENT_CACHE_TTL = 2 * 60 * 60; // 2小时 const MAX_ATTACHMENTS = 5; const MAX_TOKENS_PER_ATTACHMENT = 30000; // 单个附件最大 30k Token -const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'doc']; +const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'csv']; interface AttachmentQueryScope { userId?: string; @@ -33,6 +33,7 @@ interface AttachmentQueryScope { interface AiaAttachmentTextRecord { id: string; + filename: string; textContent: string | null; extractStatus: 'success' | 'failed' | 'empty' | string; extractError: string | null; @@ -61,7 +62,13 @@ export async function uploadAttachment( // 1. 验证文件类型 const ext = file.filename.split('.').pop()?.toLowerCase(); if (!ext || !ALLOWED_FILE_TYPES.includes(ext)) { - throw new Error(`不支持的文件类型: ${ext}。支持: ${ALLOWED_FILE_TYPES.join(', ')}`); + if (ext === 'doc') { + throw new Error('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。'); + } + if (ext === 'xls') { + throw new Error('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。'); + } + throw new Error(`不支持的文件类型: .${ext || 'unknown'}。当前支持:.pdf、.docx、.txt、.xlsx、.csv`); } // 2. 
上传到存储服务 @@ -97,8 +104,20 @@ export async function uploadAttachment( let result; if (ext === 'pdf') { result = await extractionClient.extractPdf(file.buffer, file.filename); - } else if (ext === 'docx' || ext === 'doc') { + } else if (ext === 'docx') { result = await extractionClient.extractDocx(file.buffer, file.filename); + } else if (ext === 'xlsx' || ext === 'csv') { + const markdownResult = await extractionClient.convertToMarkdown(file.buffer, file.filename); + result = { + success: markdownResult.success, + method: ext === 'csv' ? 'markdown-csv' : 'markdown-excel', + text: markdownResult.text || '', + metadata: { + filename: file.filename, + ...(markdownResult.metadata || {}), + }, + error: markdownResult.error, + }; } else { result = await extractionClient.extractDocument(file.buffer, file.filename); } @@ -253,6 +272,7 @@ export async function getAttachmentsText( where, select: { id: true, + filename: true, textContent: true, extractStatus: true, extractError: true, @@ -265,29 +285,30 @@ export async function getAttachmentsText( try { const cacheKey = `${ATTACHMENT_CACHE_PREFIX}${attachmentId}`; const text = await cache.get(cacheKey); + const record = recordMap.get(attachmentId); + const displayName = record?.filename || attachmentId; if (text) { - texts.push(`【附件: ${attachmentId}】\n${text}`); + texts.push(`【附件: ${displayName}】\n${text}`); logger.debug('[AIA:AttachmentService] 从缓存获取附件文本成功', { attachmentId, textLength: text.length, }); } else { - const record = recordMap.get(attachmentId); logger.warn('[AIA:AttachmentService] 附件文本缓存未命中,尝试数据库回源', { attachmentId, hasDbRecord: !!record, }); if (record?.extractStatus === 'success' && record.textContent) { - texts.push(`【附件: ${attachmentId}】\n${record.textContent}`); + texts.push(`【附件: ${displayName}】\n${record.textContent}`); await cache.set(cacheKey, record.textContent, ATTACHMENT_CACHE_TTL); } else if (record?.extractStatus === 'failed') { - texts.push(`【附件: ${attachmentId}】\n[附件内容提取失败:${record.extractError || 
'请重新上传附件'}]`); + texts.push(`【附件: ${displayName}】\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`); } else if (record?.extractStatus === 'empty') { - texts.push(`【附件: ${attachmentId}】\n[附件内容为空或无法提取有效文本]`); + texts.push(`【附件: ${displayName}】\n[附件内容为空或无法提取有效文本]`); } else { - texts.push(`【附件: ${attachmentId}】\n[附件内容不存在或未就绪]`); + texts.push(`【附件: ${displayName}】\n[附件内容不存在或未就绪]`); } } } catch (error) { diff --git a/backend/src/modules/aia/services/conversationService.ts b/backend/src/modules/aia/services/conversationService.ts index b9e547d7..b7e5ac67 100644 --- a/backend/src/modules/aia/services/conversationService.ts +++ b/backend/src/modules/aia/services/conversationService.ts @@ -31,6 +31,11 @@ import type { const DEFAULT_MODEL = 'deepseek-v3'; const MAX_CONTEXT_MESSAGES = 20; const MAX_CONTEXT_TOKENS = 8000; +const ATTACHMENT_QA_GUARD_PROMPT = `当本轮对话包含“附件内容”时,你必须严格遵守: +1) 仅基于本轮提供的“附件内容”和用户问题作答。 +2) 严禁引用或假设任何“系统知识库/预置知识文档/隐藏资料”内容。 +3) 若附件内容提取失败、为空或与问题无关,必须明确告知用户并请其重新上传可解析文件(建议 .pdf/.docx/.txt/.xlsx/.csv)。 +4) 禁止编造你“已阅读某知识库文档”或“基于后台知识库整理”。`; // ==================== 对话管理 ==================== @@ -313,6 +318,11 @@ export async function sendMessageStream( if (attachmentIds && attachmentIds.length > 0) { const attachmentText = await getAttachmentText(attachmentIds, userId, conversationId); if (attachmentText) { + // 附件问答加严护栏:禁止模型引用系统知识库或虚构背景文档 + contextMessages.push({ + role: 'system', + content: ATTACHMENT_QA_GUARD_PROMPT, + }); userContent = `${content}\n\n---\n附件内容:\n${attachmentText}`; } } diff --git a/backend/src/modules/aia/types/index.ts b/backend/src/modules/aia/types/index.ts index 44495cb2..51b27b26 100644 --- a/backend/src/modules/aia/types/index.ts +++ b/backend/src/modules/aia/types/index.ts @@ -168,8 +168,10 @@ export const ATTACHMENT_CONFIG = { 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'text/plain', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'text/csv', + 'application/csv', ], - 
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx'], + supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx', 'csv'], }; // ==================== API 响应格式 ==================== diff --git a/docs/05-部署文档/03-待部署变更清单.md b/docs/05-部署文档/03-待部署变更清单.md index 6bc8969b..c3d1a0c7 100644 --- a/docs/05-部署文档/03-待部署变更清单.md +++ b/docs/05-部署文档/03-待部署变更清单.md @@ -4,7 +4,7 @@ > **维护规则**: 每次修改 Schema / 新增依赖 / 改配置时,**立即**在此文档追加记录 > **Cursor Rule**: `.cursor/rules/deployment-change-tracking.mdc` 会自动提醒 > **最后清零**: 2026-03-09(0309 二次部署完成后清零) -> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token(2026-03-10) +> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token + AIA 附件格式优化(2026-03-10) --- @@ -29,6 +29,7 @@ | BE-4 | 运营埋点覆盖 6 大模块 | `deepResearchController.ts`, `reviewController.ts`, `AIController.ts`, `StreamAIController.ts`, `SessionController.ts`, `QuickActionController.ts`, `iitBatchController.ts`, `auth.controller.ts`, `auth.routes.ts` | 重新构建镜像 | ASL/RVW/DC/IIT/AIA/SYSTEM 埋点;DC 模块覆盖上传/流式AI/快速操作/非流式AI 全部 4 个入口 + 前端通用上报接口 `POST /api/v1/auth/activity` | | BE-5 | 运营看板增强(MAU/Token/最活跃用户) | `activity.service.ts`, `statsController.ts` | 重新构建镜像 | `getTodayOverview` 新增 MAU、apiTokenTotal、topActiveUser | | BE-6 | 埋点验证脚本 | `scripts/verify-activity-tracking.ts` | 无需部署 | `npm run test:tracking` 开发/运维自测用 | +| BE-7 | AIA 附件格式能力更新(支持 `.xlsx/.csv`,`.doc/.xls` 友好提示) | `modules/aia/services/attachmentService.ts`, `modules/aia/services/conversationService.ts`, `modules/aia/types/index.ts`, `common/document/ExtractionClient.ts` | 重新构建镜像 | `.xlsx/.csv` 走 `document/to-markdown` 灰度解析;统一附件问答护栏,防止引用系统知识库 | ### 前端变更 @@ -39,6 +40,7 @@ | FE-3 | 用户管理 API 层新增权限接口 | `userApi.ts` | 重新构建镜像 | `getUserDirectPermissions` / `updateUserDirectPermissions` / `getPermissionOptions` | | FE-4 | 运营看板展示 MAU/Token/最活跃用户 | `StatsDashboardPage.tsx`, `statsApi.ts` | 重新构建镜像 | 新增 4 个统计卡片 | | FE-5 | 顶部导航点击埋点上报 | `TopNavigation.tsx` | 重新构建镜像 | 点击模块导航时 fire-and-forget 上报 | +| FE-6 | AIA 上传交互更新(放开 `.xlsx/.csv`,`.doc/.xls` 友好提示) | 
`modules/aia/components/ChatWorkspace.tsx`, `modules/aia/constants.ts` | 重新构建镜像 | 上传白名单与提示文案同步后端,附件入口文案更新 | ### Python 微服务变更 diff --git a/frontend-v2/src/modules/aia/components/ChatWorkspace.tsx b/frontend-v2/src/modules/aia/components/ChatWorkspace.tsx index 50c19f89..272fcb9d 100644 --- a/frontend-v2/src/modules/aia/components/ChatWorkspace.tsx +++ b/frontend-v2/src/modules/aia/components/ChatWorkspace.tsx @@ -42,6 +42,7 @@ const getFileIcon = (filename: string): string => { docx: 'word', xls: 'excel', xlsx: 'excel', + csv: 'excel', ppt: 'ppt', pptx: 'ppt', txt: 'default', @@ -421,11 +422,17 @@ export const ChatWorkspace: React.FC = ({ } const file = files[0]; - const allowedTypes = ['.pdf', '.docx', '.doc', '.txt', '.xlsx']; + const allowedTypes = ['.pdf', '.docx', '.txt', '.xlsx', '.csv']; const ext = '.' + file.name.split('.').pop()?.toLowerCase(); if (!allowedTypes.includes(ext)) { - alert(`不支持的文件类型: ${ext}\n支持的类型: ${allowedTypes.join(', ')}`); + if (ext === '.doc') { + alert('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。'); + } else if (ext === '.xls') { + alert('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。'); + } else { + alert(`不支持的文件类型: ${ext}\n当前支持: ${allowedTypes.join(', ')}`); + } return; } @@ -453,7 +460,9 @@ export const ChatWorkspace: React.FC = ({ }); if (!response.ok) { - throw new Error(`上传失败: ${response.status}`); + const errorJson = await response.json().catch(() => null); + const backendMessage = errorJson?.error?.message; + throw new Error(backendMessage || `上传失败: ${response.status}`); } const result = await response.json(); @@ -469,7 +478,8 @@ export const ChatWorkspace: React.FC = ({ console.error('[ChatWorkspace] 上传附件失败:', error); // 移除失败的附件 setAttachments(prev => prev.filter(a => a.id !== tempId)); - alert('附件上传失败,请重试'); + const message = error instanceof Error ? 
error.message : '附件上传失败,请重试'; + alert(message); } finally { setIsUploading(false); // 清空 input,允许重复选择同一文件 @@ -780,7 +790,7 @@ export const ChatWorkspace: React.FC = ({ className={`attachment-btn ${attachments.length > 0 ? 'has-attachments' : ''}`} onClick={handleAttachmentClick} disabled={isUploading} - title="添加附件(PDF、Word、TXT、Excel)" + title="添加附件(PDF、DOCX、TXT、XLSX、CSV)" > {attachments.length > 0 && ( @@ -818,7 +828,7 @@ export const ChatWorkspace: React.FC = ({ diff --git a/frontend-v2/src/modules/aia/constants.ts b/frontend-v2/src/modules/aia/constants.ts index 037eb80d..bdb6c09a 100644 --- a/frontend-v2/src/modules/aia/constants.ts +++ b/frontend-v2/src/modules/aia/constants.ts @@ -186,6 +186,7 @@ export const BRAND_COLORS = { teal: '#0D9488', purple: '#9333EA', yellow: '#CA8A04', + indigo: '#6366F1', } as const;