fix(aia): stabilize attachment parsing and expand spreadsheet support

Align AIA attachment handling with what the extractor can actually parse: add gray-release (canary) support for .xlsx/.csv, prompt .doc/.xls users to convert their files to .docx/.xlsx, and enforce attachment-only answering so the model cannot cite or invent system knowledge-base content.

Made-with: Cursor
This commit is contained in:
2026-03-10 13:15:36 +08:00
parent 097e7920ab
commit d96cdf3fe8
8 changed files with 118 additions and 19 deletions

View File

@@ -31,6 +31,19 @@ export interface ExtractionResult {
error?: string;
}
/**
 * Response body of the generic document-to-Markdown conversion call.
 *
 * Only `success` is mandatory; every other member may be absent.
 */
export interface MarkdownExtractionResult {
  /** Outcome flag reported by the conversion. */
  success: boolean;
  /** Error description, when one is provided. */
  error?: string;
  /** Converted text, when one is provided. */
  text?: string;
  /** Format label of `text` (presumably "markdown"; confirm against the service). */
  format?: string;
  /** Descriptive information about the converted file; additional keys are allowed. */
  metadata?: {
    filename?: string;
    original_file_type?: string;
    char_count?: number;
    [key: string]: unknown;
  };
}
/**
* 数据侦探结果(Python 返回)
*/
@@ -95,6 +108,7 @@ export interface IExtractionClient {
extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;
detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>;
getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>;
convertToMarkdown(file: Buffer, filename: string): Promise<MarkdownExtractionResult>;
analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>;
}
@@ -332,6 +346,40 @@ class ExtractionClient implements IExtractionClient {
}
}
/**
 * Convert a generic document to Markdown (used for gray-release formats such as .xlsx).
 *
 * Sends the file as multipart form data to `${baseUrl}/api/document/to-markdown`
 * and returns the response body unchanged.
 *
 * @param file - Raw file contents to upload.
 * @param filename - Filename attached to the multipart `file` field.
 * @returns The response body of the conversion endpoint.
 * @throws Error whose message starts with "Document markdown conversion failed".
 *   When the server responded, the message also carries the response's `detail`
 *   (or the axios error message if no usable `detail` is present).
 */
async convertToMarkdown(
  file: Buffer,
  filename: string
): Promise<MarkdownExtractionResult> {
  try {
    const formData = new FormData();
    formData.append('file', file, filename);

    const response = await axios.post<MarkdownExtractionResult>(
      `${this.baseUrl}/api/document/to-markdown`,
      formData,
      {
        headers: {
          ...formData.getHeaders(),
        },
        timeout: 180000, // 3-minute timeout (large Excel files are slow to convert)
      }
    );

    return response.data;
  } catch (error) {
    console.error('[ExtractionClient] Convert to markdown failed:', error);

    if (axios.isAxiosError(error) && error.response) {
      // The error body may be null/undefined, so `.detail` is read with optional
      // chaining; otherwise a TypeError here would mask the original failure.
      const detail: unknown = error.response.data?.detail;
      // A structured detail (object/array) is serialized instead of being
      // interpolated as "[object Object]".
      const reason = !detail
        ? error.message
        : typeof detail === 'string'
          ? detail
          : JSON.stringify(detail);
      throw new Error(`Document markdown conversion failed: ${reason}`);
    }

    throw new Error('Document markdown conversion failed');
  }
}
/**
* 🆕 数据侦探 API - 分析 Word 文档
* 提取表格并进行数据验证L1 算术 + L2 统计 + L2.5 一致性)

View File

@@ -79,10 +79,15 @@ export async function uploadAttachment(
stack: error instanceof Error ? error.stack : undefined,
});
return reply.status(500).send({
const isValidationError =
errorMessage.includes('不支持') ||
errorMessage.includes('文件类型') ||
errorMessage.includes('请在 Word 中另存为 .docx');
return reply.status(isValidationError ? 400 : 500).send({
code: -1,
error: {
code: 'INTERNAL_ERROR',
code: isValidationError ? 'VALIDATION_ERROR' : 'INTERNAL_ERROR',
message: errorMessage,
},
});

View File

@@ -24,7 +24,7 @@ const ATTACHMENT_CACHE_TTL = 2 * 60 * 60; // 2小时
const MAX_ATTACHMENTS = 5;
const MAX_TOKENS_PER_ATTACHMENT = 30000; // 单个附件最大 30k Token
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'doc'];
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'csv'];
interface AttachmentQueryScope {
userId?: string;
@@ -33,6 +33,7 @@ interface AttachmentQueryScope {
interface AiaAttachmentTextRecord {
id: string;
filename: string;
textContent: string | null;
extractStatus: 'success' | 'failed' | 'empty' | string;
extractError: string | null;
@@ -61,7 +62,13 @@ export async function uploadAttachment(
// 1. 验证文件类型
const ext = file.filename.split('.').pop()?.toLowerCase();
if (!ext || !ALLOWED_FILE_TYPES.includes(ext)) {
throw new Error(`不支持的文件类型: ${ext}。支持: ${ALLOWED_FILE_TYPES.join(', ')}`);
if (ext === 'doc') {
throw new Error('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
}
if (ext === 'xls') {
throw new Error('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
}
throw new Error(`不支持的文件类型: .${ext || 'unknown'}。当前支持:.pdf、.docx、.txt、.xlsx、.csv`);
}
// 2. 上传到存储服务
@@ -97,8 +104,20 @@ export async function uploadAttachment(
let result;
if (ext === 'pdf') {
result = await extractionClient.extractPdf(file.buffer, file.filename);
} else if (ext === 'docx' || ext === 'doc') {
} else if (ext === 'docx') {
result = await extractionClient.extractDocx(file.buffer, file.filename);
} else if (ext === 'xlsx' || ext === 'csv') {
const markdownResult = await extractionClient.convertToMarkdown(file.buffer, file.filename);
result = {
success: markdownResult.success,
method: ext === 'csv' ? 'markdown-csv' : 'markdown-excel',
text: markdownResult.text || '',
metadata: {
filename: file.filename,
...(markdownResult.metadata || {}),
},
error: markdownResult.error,
};
} else {
result = await extractionClient.extractDocument(file.buffer, file.filename);
}
@@ -253,6 +272,7 @@ export async function getAttachmentsText(
where,
select: {
id: true,
filename: true,
textContent: true,
extractStatus: true,
extractError: true,
@@ -265,29 +285,30 @@ export async function getAttachmentsText(
try {
const cacheKey = `${ATTACHMENT_CACHE_PREFIX}${attachmentId}`;
const text = await cache.get<string>(cacheKey);
const record = recordMap.get(attachmentId);
const displayName = record?.filename || attachmentId;
if (text) {
texts.push(`【附件: ${attachmentId}\n${text}`);
texts.push(`【附件: ${displayName}\n${text}`);
logger.debug('[AIA:AttachmentService] 从缓存获取附件文本成功', {
attachmentId,
textLength: text.length,
});
} else {
const record = recordMap.get(attachmentId);
logger.warn('[AIA:AttachmentService] 附件文本缓存未命中,尝试数据库回源', {
attachmentId,
hasDbRecord: !!record,
});
if (record?.extractStatus === 'success' && record.textContent) {
texts.push(`【附件: ${attachmentId}\n${record.textContent}`);
texts.push(`【附件: ${displayName}\n${record.textContent}`);
await cache.set(cacheKey, record.textContent, ATTACHMENT_CACHE_TTL);
} else if (record?.extractStatus === 'failed') {
texts.push(`【附件: ${attachmentId}\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
texts.push(`【附件: ${displayName}\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
} else if (record?.extractStatus === 'empty') {
texts.push(`【附件: ${attachmentId}\n[附件内容为空或无法提取有效文本]`);
texts.push(`【附件: ${displayName}\n[附件内容为空或无法提取有效文本]`);
} else {
texts.push(`【附件: ${attachmentId}\n[附件内容不存在或未就绪]`);
texts.push(`【附件: ${displayName}\n[附件内容不存在或未就绪]`);
}
}
} catch (error) {

View File

@@ -31,6 +31,11 @@ import type {
const DEFAULT_MODEL = 'deepseek-v3';
const MAX_CONTEXT_MESSAGES = 20;
const MAX_CONTEXT_TOKENS = 8000;
/**
 * Guard prompt for attachment-based Q&A.
 *
 * It tells the model to answer only from the attachment content supplied in the
 * current turn, never to cite or assume any system knowledge base, to tell the
 * user when the attachment could not be parsed, and never to claim it read
 * knowledge-base documents. Rule 3's parenthetical list of suggested formats is
 * closed and terminated like the other rules.
 */
const ATTACHMENT_QA_GUARD_PROMPT = `当本轮对话包含“附件内容”时,你必须严格遵守:
1) 仅基于本轮提供的“附件内容”和用户问题作答。
2) 严禁引用或假设任何“系统知识库/预置知识文档/隐藏资料”内容。
3) 若附件内容提取失败、为空或与问题无关,必须明确告知用户并请其重新上传可解析文件(建议 .pdf/.docx/.txt/.xlsx/.csv)。
4) 禁止编造你“已阅读某知识库文档”或“基于后台知识库整理”。`;
// ==================== 对话管理 ====================
@@ -313,6 +318,11 @@ export async function sendMessageStream(
if (attachmentIds && attachmentIds.length > 0) {
const attachmentText = await getAttachmentText(attachmentIds, userId, conversationId);
if (attachmentText) {
// 附件问答加严护栏:禁止模型引用系统知识库或虚构背景文档
contextMessages.push({
role: 'system',
content: ATTACHMENT_QA_GUARD_PROMPT,
});
userContent = `${content}\n\n---\n附件内容\n${attachmentText}`;
}
}

View File

@@ -168,8 +168,10 @@ export const ATTACHMENT_CONFIG = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/csv',
'application/csv',
],
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx'],
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx', 'csv'],
};
// ==================== API 响应格式 ====================

View File

@@ -4,7 +4,7 @@
> **维护规则**: 每次修改 Schema / 新增依赖 / 改配置时,**立即**在此文档追加记录
> **Cursor Rule**: `.cursor/rules/deployment-change-tracking.mdc` 会自动提醒
> **最后清零**: 2026-03-09(0309 二次部署完成后清零)
> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token2026-03-10
> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token + AIA 附件格式优化(2026-03-10)
---
@@ -29,6 +29,7 @@
| BE-4 | 运营埋点覆盖 6 大模块 | `deepResearchController.ts`, `reviewController.ts`, `AIController.ts`, `StreamAIController.ts`, `SessionController.ts`, `QuickActionController.ts`, `iitBatchController.ts`, `auth.controller.ts`, `auth.routes.ts` | 重新构建镜像 | ASL/RVW/DC/IIT/AIA/SYSTEM 埋点DC 模块覆盖上传/流式AI/快速操作/非流式AI 全部 4 个入口 + 前端通用上报接口 `POST /api/v1/auth/activity` |
| BE-5 | 运营看板增强MAU/Token/最活跃用户) | `activity.service.ts`, `statsController.ts` | 重新构建镜像 | `getTodayOverview` 新增 MAU、apiTokenTotal、topActiveUser |
| BE-6 | 埋点验证脚本 | `scripts/verify-activity-tracking.ts` | 无需部署 | `npm run test:tracking` 开发/运维自测用 |
| BE-7 | AIA 附件格式能力更新(支持 `.xlsx/.csv`,`.doc/.xls` 友好提示) | `modules/aia/services/attachmentService.ts`, `modules/aia/services/conversationService.ts`, `modules/aia/types/index.ts`, `common/document/ExtractionClient.ts` | 重新构建镜像 | `.xlsx/.csv` 走 `document/to-markdown` 灰度解析;统一附件问答护栏,防止引用系统知识库 |
### 前端变更
@@ -39,6 +40,7 @@
| FE-3 | 用户管理 API 层新增权限接口 | `userApi.ts` | 重新构建镜像 | `getUserDirectPermissions` / `updateUserDirectPermissions` / `getPermissionOptions` |
| FE-4 | 运营看板展示 MAU/Token/最活跃用户 | `StatsDashboardPage.tsx`, `statsApi.ts` | 重新构建镜像 | 新增 4 个统计卡片 |
| FE-5 | 顶部导航点击埋点上报 | `TopNavigation.tsx` | 重新构建镜像 | 点击模块导航时 fire-and-forget 上报 |
| FE-6 | AIA 上传交互更新(放开 `.xlsx/.csv`,`.doc/.xls` 友好提示) | `modules/aia/components/ChatWorkspace.tsx`, `modules/aia/constants.ts` | 重新构建镜像 | 上传白名单与提示文案同步后端,附件入口文案更新 |
### Python 微服务变更

View File

@@ -42,6 +42,7 @@ const getFileIcon = (filename: string): string => {
docx: 'word',
xls: 'excel',
xlsx: 'excel',
csv: 'excel',
ppt: 'ppt',
pptx: 'ppt',
txt: 'default',
@@ -421,11 +422,17 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
}
const file = files[0];
const allowedTypes = ['.pdf', '.docx', '.doc', '.txt', '.xlsx'];
const allowedTypes = ['.pdf', '.docx', '.txt', '.xlsx', '.csv'];
const ext = '.' + file.name.split('.').pop()?.toLowerCase();
if (!allowedTypes.includes(ext)) {
alert(`不支持的文件类型: ${ext}\n支持的类型: ${allowedTypes.join(', ')}`);
if (ext === '.doc') {
alert('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
} else if (ext === '.xls') {
alert('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
} else {
alert(`不支持的文件类型: ${ext}\n当前支持: ${allowedTypes.join(', ')}`);
}
return;
}
@@ -453,7 +460,9 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
});
if (!response.ok) {
throw new Error(`上传失败: ${response.status}`);
const errorJson = await response.json().catch(() => null);
const backendMessage = errorJson?.error?.message;
throw new Error(backendMessage || `上传失败: ${response.status}`);
}
const result = await response.json();
@@ -469,7 +478,8 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
console.error('[ChatWorkspace] 上传附件失败:', error);
// 移除失败的附件
setAttachments(prev => prev.filter(a => a.id !== tempId));
alert('附件上传失败,请重试');
const message = error instanceof Error ? error.message : '附件上传失败,请重试';
alert(message);
} finally {
setIsUploading(false);
// 清空 input允许重复选择同一文件
@@ -780,7 +790,7 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
className={`attachment-btn ${attachments.length > 0 ? 'has-attachments' : ''}`}
onClick={handleAttachmentClick}
disabled={isUploading}
title="添加附件PDF、Word、TXT、Excel"
title="添加附件PDF、DOCX、TXT、XLSX、CSV"
>
<Paperclip size={16} />
{attachments.length > 0 && (
@@ -818,7 +828,7 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
<input
ref={fileInputRef}
type="file"
accept=".pdf,.docx,.doc,.txt,.xlsx"
accept=".pdf,.docx,.txt,.xlsx,.xls,.csv"
onChange={handleFileChange}
style={{ display: 'none' }}
/>

View File

@@ -186,6 +186,7 @@ export const BRAND_COLORS = {
teal: '#0D9488',
purple: '#9333EA',
yellow: '#CA8A04',
indigo: '#6366F1',
} as const;