fix(aia): stabilize attachment parsing and expand spreadsheet support
Align AIA attachment handling with what the extractor can actually parse: add gray-release (limited rollout) support for .xlsx/.csv, guide .doc/.xls users to convert their files to .docx/.xlsx, and enforce attachment-only answering so the model does not leak or invent system knowledge-base content.
Made-with: Cursor
This commit is contained in:
@@ -31,6 +31,19 @@ export interface ExtractionResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Response shape of the document-to-Markdown conversion call
 * (the type `convertToMarkdown` resolves with).
 *
 * Every field except `success` is optional, so callers must handle a
 * missing `text`, `metadata`, or `error`.
 */
export interface MarkdownExtractionResult {
  /** Outcome flag reported by the conversion service. */
  success: boolean;
  /** Converted document text, when present. */
  text?: string;
  /** Output format label — presumably "markdown"; confirm against the service. */
  format?: string;
  /** Descriptive details about the converted document; extra keys are allowed. */
  metadata?: {
    original_file_type?: string;
    char_count?: number;
    filename?: string;
    [key: string]: unknown;
  };
  /** Error description, when present. */
  error?: string;
}
|
||||
|
||||
/**
|
||||
* 数据侦探结果(Python 返回)
|
||||
*/
|
||||
@@ -95,6 +108,7 @@ export interface IExtractionClient {
|
||||
extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;
|
||||
detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>;
|
||||
getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>;
|
||||
convertToMarkdown(file: Buffer, filename: string): Promise<MarkdownExtractionResult>;
|
||||
analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>;
|
||||
}
|
||||
|
||||
@@ -332,6 +346,40 @@ class ExtractionClient implements IExtractionClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Generic document-to-Markdown conversion (used for gray-release formats
 * such as .xlsx).
 *
 * Uploads the file as multipart form data to `/api/document/to-markdown`
 * and returns the response body unchanged. The body's `success` flag is
 * not inspected here, so callers must check it themselves.
 *
 * @param file - Raw file contents.
 * @param filename - Filename attached to the multipart `file` field.
 * @returns The conversion service's response body.
 * @throws Error when the request fails. If the service answered with an
 *   error response, the message includes its `detail` (or the axios error
 *   message when no usable `detail` is present).
 */
async convertToMarkdown(
  file: Buffer,
  filename: string
): Promise<MarkdownExtractionResult> {
  try {
    const formData = new FormData();
    formData.append('file', file, filename);

    const response = await axios.post<MarkdownExtractionResult>(
      `${this.baseUrl}/api/document/to-markdown`,
      formData,
      {
        headers: {
          ...formData.getHeaders(),
        },
        timeout: 180000, // 3-minute timeout (large Excel files are slow to convert)
      }
    );

    return response.data;
  } catch (error) {
    console.error('[ExtractionClient] Convert to markdown failed:', error);

    if (axios.isAxiosError(error) && error.response) {
      // The error body may be empty/null, and `detail` may be a structured
      // value (array/object) rather than a string. Read it defensively so
      // this catch block never throws a TypeError of its own and never
      // renders "[object Object]".
      const detail: unknown = error.response.data?.detail;
      let reason: string;
      if (detail !== null && typeof detail === 'object') {
        reason = JSON.stringify(detail);
      } else {
        reason = detail ? String(detail) : error.message;
      }
      throw new Error(`Document markdown conversion failed: ${reason}`);
    }

    throw new Error('Document markdown conversion failed');
  }
}
|
||||
|
||||
/**
|
||||
* 🆕 数据侦探 API - 分析 Word 文档
|
||||
* 提取表格并进行数据验证(L1 算术 + L2 统计 + L2.5 一致性)
|
||||
|
||||
@@ -79,10 +79,15 @@ export async function uploadAttachment(
|
||||
stack: error instanceof Error ? error.stack : undefined,
|
||||
});
|
||||
|
||||
return reply.status(500).send({
|
||||
// Classify user-fixable upload rejections as validation errors so they are
// answered with HTTP 400 instead of 500. The .doc/.xls conversion hints
// thrown by the file-type check say "当前仅支持 .docx/.xlsx 格式" and contain
// neither "不支持" nor "文件类型", so "仅支持" must be matched explicitly.
const isValidationError =
  errorMessage.includes('不支持') ||
  errorMessage.includes('仅支持') ||
  errorMessage.includes('文件类型') ||
  errorMessage.includes('请在 Word 中另存为 .docx');
|
||||
|
||||
return reply.status(isValidationError ? 400 : 500).send({
|
||||
code: -1,
|
||||
error: {
|
||||
code: 'INTERNAL_ERROR',
|
||||
code: isValidationError ? 'VALIDATION_ERROR' : 'INTERNAL_ERROR',
|
||||
message: errorMessage,
|
||||
},
|
||||
});
|
||||
|
||||
@@ -24,7 +24,7 @@ const ATTACHMENT_CACHE_TTL = 2 * 60 * 60; // 2小时
|
||||
|
||||
const MAX_ATTACHMENTS = 5;
|
||||
const MAX_TOKENS_PER_ATTACHMENT = 30000; // 单个附件最大 30k Token
|
||||
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'doc'];
|
||||
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'csv'];
|
||||
|
||||
interface AttachmentQueryScope {
|
||||
userId?: string;
|
||||
@@ -33,6 +33,7 @@ interface AttachmentQueryScope {
|
||||
|
||||
interface AiaAttachmentTextRecord {
|
||||
id: string;
|
||||
filename: string;
|
||||
textContent: string | null;
|
||||
extractStatus: 'success' | 'failed' | 'empty' | string;
|
||||
extractError: string | null;
|
||||
@@ -61,7 +62,13 @@ export async function uploadAttachment(
|
||||
// 1. 验证文件类型
|
||||
const ext = file.filename.split('.').pop()?.toLowerCase();
|
||||
if (!ext || !ALLOWED_FILE_TYPES.includes(ext)) {
|
||||
throw new Error(`不支持的文件类型: ${ext}。支持: ${ALLOWED_FILE_TYPES.join(', ')}`);
|
||||
if (ext === 'doc') {
|
||||
throw new Error('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
|
||||
}
|
||||
if (ext === 'xls') {
|
||||
throw new Error('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
|
||||
}
|
||||
throw new Error(`不支持的文件类型: .${ext || 'unknown'}。当前支持:.pdf、.docx、.txt、.xlsx、.csv`);
|
||||
}
|
||||
|
||||
// 2. 上传到存储服务
|
||||
@@ -97,8 +104,20 @@ export async function uploadAttachment(
|
||||
let result;
|
||||
if (ext === 'pdf') {
|
||||
result = await extractionClient.extractPdf(file.buffer, file.filename);
|
||||
} else if (ext === 'docx' || ext === 'doc') {
|
||||
} else if (ext === 'docx') {
|
||||
result = await extractionClient.extractDocx(file.buffer, file.filename);
|
||||
} else if (ext === 'xlsx' || ext === 'csv') {
|
||||
const markdownResult = await extractionClient.convertToMarkdown(file.buffer, file.filename);
|
||||
result = {
|
||||
success: markdownResult.success,
|
||||
method: ext === 'csv' ? 'markdown-csv' : 'markdown-excel',
|
||||
text: markdownResult.text || '',
|
||||
metadata: {
|
||||
filename: file.filename,
|
||||
...(markdownResult.metadata || {}),
|
||||
},
|
||||
error: markdownResult.error,
|
||||
};
|
||||
} else {
|
||||
result = await extractionClient.extractDocument(file.buffer, file.filename);
|
||||
}
|
||||
@@ -253,6 +272,7 @@ export async function getAttachmentsText(
|
||||
where,
|
||||
select: {
|
||||
id: true,
|
||||
filename: true,
|
||||
textContent: true,
|
||||
extractStatus: true,
|
||||
extractError: true,
|
||||
@@ -265,29 +285,30 @@ export async function getAttachmentsText(
|
||||
try {
|
||||
const cacheKey = `${ATTACHMENT_CACHE_PREFIX}${attachmentId}`;
|
||||
const text = await cache.get<string>(cacheKey);
|
||||
const record = recordMap.get(attachmentId);
|
||||
const displayName = record?.filename || attachmentId;
|
||||
|
||||
if (text) {
|
||||
texts.push(`【附件: ${attachmentId}】\n${text}`);
|
||||
texts.push(`【附件: ${displayName}】\n${text}`);
|
||||
logger.debug('[AIA:AttachmentService] 从缓存获取附件文本成功', {
|
||||
attachmentId,
|
||||
textLength: text.length,
|
||||
});
|
||||
} else {
|
||||
const record = recordMap.get(attachmentId);
|
||||
logger.warn('[AIA:AttachmentService] 附件文本缓存未命中,尝试数据库回源', {
|
||||
attachmentId,
|
||||
hasDbRecord: !!record,
|
||||
});
|
||||
|
||||
if (record?.extractStatus === 'success' && record.textContent) {
|
||||
texts.push(`【附件: ${attachmentId}】\n${record.textContent}`);
|
||||
texts.push(`【附件: ${displayName}】\n${record.textContent}`);
|
||||
await cache.set(cacheKey, record.textContent, ATTACHMENT_CACHE_TTL);
|
||||
} else if (record?.extractStatus === 'failed') {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
|
||||
} else if (record?.extractStatus === 'empty') {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容为空或无法提取有效文本]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容为空或无法提取有效文本]`);
|
||||
} else {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容不存在或未就绪]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容不存在或未就绪]`);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
@@ -31,6 +31,11 @@ import type {
|
||||
const DEFAULT_MODEL = 'deepseek-v3';
|
||||
const MAX_CONTEXT_MESSAGES = 20;
|
||||
const MAX_CONTEXT_TOKENS = 8000;
|
||||
// Guard prompt for attachment Q&A. It tells the model to:
//   1) answer only from this turn's attachment content and the user's question;
//   2) never cite or assume any system knowledge base / preset or hidden documents;
//   3) if extraction failed, or the content is empty or unrelated, say so and ask
//      the user to re-upload a parsable file (.pdf/.docx/.txt/.xlsx/.csv suggested);
//   4) never claim to have read or summarized a back-end knowledge base.
// The literal must contain exactly these five lines: any stray characters
// inside the template become part of the prompt sent to the model.
const ATTACHMENT_QA_GUARD_PROMPT = `当本轮对话包含“附件内容”时,你必须严格遵守:
1) 仅基于本轮提供的“附件内容”和用户问题作答。
2) 严禁引用或假设任何“系统知识库/预置知识文档/隐藏资料”内容。
3) 若附件内容提取失败、为空或与问题无关,必须明确告知用户并请其重新上传可解析文件(建议 .pdf/.docx/.txt/.xlsx/.csv)。
4) 禁止编造你“已阅读某知识库文档”或“基于后台知识库整理”。`;
|
||||
|
||||
// ==================== 对话管理 ====================
|
||||
|
||||
@@ -313,6 +318,11 @@ export async function sendMessageStream(
|
||||
if (attachmentIds && attachmentIds.length > 0) {
|
||||
const attachmentText = await getAttachmentText(attachmentIds, userId, conversationId);
|
||||
if (attachmentText) {
|
||||
// 附件问答加严护栏:禁止模型引用系统知识库或虚构背景文档
|
||||
contextMessages.push({
|
||||
role: 'system',
|
||||
content: ATTACHMENT_QA_GUARD_PROMPT,
|
||||
});
|
||||
userContent = `${content}\n\n---\n附件内容:\n${attachmentText}`;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,8 +168,10 @@ export const ATTACHMENT_CONFIG = {
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'text/plain',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'text/csv',
|
||||
'application/csv',
|
||||
],
|
||||
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx'],
|
||||
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx', 'csv'],
|
||||
};
|
||||
|
||||
// ==================== API 响应格式 ====================
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
> **维护规则**: 每次修改 Schema / 新增依赖 / 改配置时,**立即**在此文档追加记录
|
||||
> **Cursor Rule**: `.cursor/rules/deployment-change-tracking.mdc` 会自动提醒
|
||||
> **最后清零**: 2026-03-09(0309 二次部署完成后清零)
|
||||
> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token(2026-03-10)
|
||||
> **本次变更**: 用户直授权限体系 + 运营埋点增强 + 运营看板 MAU/Token + AIA 附件格式优化(2026-03-10)
|
||||
|
||||
---
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
| BE-4 | 运营埋点覆盖 6 大模块 | `deepResearchController.ts`, `reviewController.ts`, `AIController.ts`, `StreamAIController.ts`, `SessionController.ts`, `QuickActionController.ts`, `iitBatchController.ts`, `auth.controller.ts`, `auth.routes.ts` | 重新构建镜像 | ASL/RVW/DC/IIT/AIA/SYSTEM 埋点;DC 模块覆盖上传/流式AI/快速操作/非流式AI 全部 4 个入口 + 前端通用上报接口 `POST /api/v1/auth/activity` |
|
||||
| BE-5 | 运营看板增强(MAU/Token/最活跃用户) | `activity.service.ts`, `statsController.ts` | 重新构建镜像 | `getTodayOverview` 新增 MAU、apiTokenTotal、topActiveUser |
|
||||
| BE-6 | 埋点验证脚本 | `scripts/verify-activity-tracking.ts` | 无需部署 | `npm run test:tracking` 开发/运维自测用 |
|
||||
| BE-7 | AIA 附件格式能力更新(支持 `.xlsx/.csv`,`.doc/.xls` 友好提示) | `modules/aia/services/attachmentService.ts`, `modules/aia/services/conversationService.ts`, `modules/aia/types/index.ts`, `common/document/ExtractionClient.ts` | 重新构建镜像 | `.xlsx/.csv` 走 `document/to-markdown` 灰度解析;统一附件问答护栏,防止引用系统知识库 |
|
||||
|
||||
### 前端变更
|
||||
|
||||
@@ -39,6 +40,7 @@
|
||||
| FE-3 | 用户管理 API 层新增权限接口 | `userApi.ts` | 重新构建镜像 | `getUserDirectPermissions` / `updateUserDirectPermissions` / `getPermissionOptions` |
|
||||
| FE-4 | 运营看板展示 MAU/Token/最活跃用户 | `StatsDashboardPage.tsx`, `statsApi.ts` | 重新构建镜像 | 新增 4 个统计卡片 |
|
||||
| FE-5 | 顶部导航点击埋点上报 | `TopNavigation.tsx` | 重新构建镜像 | 点击模块导航时 fire-and-forget 上报 |
|
||||
| FE-6 | AIA 上传交互更新(放开 `.xlsx/.csv`,`.doc/.xls` 友好提示) | `modules/aia/components/ChatWorkspace.tsx`, `modules/aia/constants.ts` | 重新构建镜像 | 上传白名单与提示文案同步后端,附件入口文案更新 |
|
||||
|
||||
### Python 微服务变更
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ const getFileIcon = (filename: string): string => {
|
||||
docx: 'word',
|
||||
xls: 'excel',
|
||||
xlsx: 'excel',
|
||||
csv: 'excel',
|
||||
ppt: 'ppt',
|
||||
pptx: 'ppt',
|
||||
txt: 'default',
|
||||
@@ -421,11 +422,17 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
|
||||
}
|
||||
|
||||
const file = files[0];
|
||||
const allowedTypes = ['.pdf', '.docx', '.doc', '.txt', '.xlsx'];
|
||||
const allowedTypes = ['.pdf', '.docx', '.txt', '.xlsx', '.csv'];
|
||||
const ext = '.' + file.name.split('.').pop()?.toLowerCase();
|
||||
|
||||
if (!allowedTypes.includes(ext)) {
|
||||
alert(`不支持的文件类型: ${ext}\n支持的类型: ${allowedTypes.join(', ')}`);
|
||||
if (ext === '.doc') {
|
||||
alert('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
|
||||
} else if (ext === '.xls') {
|
||||
alert('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
|
||||
} else {
|
||||
alert(`不支持的文件类型: ${ext}\n当前支持: ${allowedTypes.join(', ')}`);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -453,7 +460,9 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`上传失败: ${response.status}`);
|
||||
const errorJson = await response.json().catch(() => null);
|
||||
const backendMessage = errorJson?.error?.message;
|
||||
throw new Error(backendMessage || `上传失败: ${response.status}`);
|
||||
}
|
||||
|
||||
const result = await response.json();
|
||||
@@ -469,7 +478,8 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
|
||||
console.error('[ChatWorkspace] 上传附件失败:', error);
|
||||
// 移除失败的附件
|
||||
setAttachments(prev => prev.filter(a => a.id !== tempId));
|
||||
alert('附件上传失败,请重试');
|
||||
const message = error instanceof Error ? error.message : '附件上传失败,请重试';
|
||||
alert(message);
|
||||
} finally {
|
||||
setIsUploading(false);
|
||||
// 清空 input,允许重复选择同一文件
|
||||
@@ -780,7 +790,7 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
|
||||
className={`attachment-btn ${attachments.length > 0 ? 'has-attachments' : ''}`}
|
||||
onClick={handleAttachmentClick}
|
||||
disabled={isUploading}
|
||||
title="添加附件(PDF、Word、TXT、Excel)"
|
||||
title="添加附件(PDF、DOCX、TXT、XLSX、CSV)"
|
||||
>
|
||||
<Paperclip size={16} />
|
||||
{attachments.length > 0 && (
|
||||
@@ -818,7 +828,7 @@ export const ChatWorkspace: React.FC<ChatWorkspaceProps> = ({
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
accept=".pdf,.docx,.doc,.txt,.xlsx"
|
||||
accept=".pdf,.docx,.txt,.xlsx,.xls,.csv"
|
||||
onChange={handleFileChange}
|
||||
style={{ display: 'none' }}
|
||||
/>
|
||||
|
||||
@@ -186,6 +186,7 @@ export const BRAND_COLORS = {
|
||||
teal: '#0D9488',
|
||||
purple: '#9333EA',
|
||||
yellow: '#CA8A04',
|
||||
indigo: '#6366F1',
|
||||
} as const;
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user