fix(aia): stabilize attachment parsing and expand spreadsheet support
Align AIA attachment handling with what the extractor can actually parse: add gray-release (limited-rollout) support for .xlsx/.csv, guide .doc/.xls users to convert their files to .docx/.xlsx before uploading, and enforce attachment-only answering to prevent system knowledge-base leakage.

Made-with: Cursor
This commit is contained in:
@@ -31,6 +31,19 @@ export interface ExtractionResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
 * Result of a document-to-Markdown conversion.
 *
 * This is the response-body type of the `/api/document/to-markdown` request
 * and the return type of `convertToMarkdown`. Every field except `success`
 * is optional, so consumers must supply their own fallbacks.
 */
export interface MarkdownExtractionResult {
|
||||
success: boolean; // copied as-is into the upload result by the xlsx/csv branch
|
||||
text?: string; // converted text; the xlsx/csv branch falls back to '' when absent
|
||||
format?: string; // NOTE(review): presumably the output format name — confirm against the service
|
||||
metadata?: { // spread after the local `filename` by the xlsx/csv branch, so keys here take precedence
|
||||
original_file_type?: string; // NOTE(review): presumably the source file's type/extension — confirm
|
||||
char_count?: number; // NOTE(review): presumably the length of `text` — confirm
|
||||
filename?: string;
|
||||
[key: string]: unknown; // allows additional metadata keys beyond the named ones
|
||||
};
|
||||
error?: string; // forwarded into the upload result's `error`
|
||||
}
|
||||
|
||||
/**
|
||||
* 数据侦探结果(Python 返回)
|
||||
*/
|
||||
@@ -95,6 +108,7 @@ export interface IExtractionClient {
|
||||
extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;
|
||||
detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>;
|
||||
getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>;
|
||||
convertToMarkdown(file: Buffer, filename: string): Promise<MarkdownExtractionResult>;
|
||||
analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>;
|
||||
}
|
||||
|
||||
@@ -332,6 +346,40 @@ class ExtractionClient implements IExtractionClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Generic document-to-Markdown conversion (used for gray-release formats
 * such as .xlsx).
 *
 * Posts the file as multipart form data to `/api/document/to-markdown` and
 * returns the response body unchanged.
 *
 * @param file - Raw file contents to upload.
 * @param filename - Name passed as the multipart filename of the `file` part.
 * @returns The response body of the conversion request.
 * @throws Error with the message `Document markdown conversion failed: <reason>`
 *   when the service answered with an error response, where `<reason>` is the
 *   response's `detail` (serialized if it is not a string) or the axios error
 *   message; `Document markdown conversion failed` for every other failure
 *   (no response received, timeout, non-axios error).
 */
async convertToMarkdown(
  file: Buffer,
  filename: string
): Promise<MarkdownExtractionResult> {
  try {
    const formData = new FormData();
    formData.append('file', file, filename);

    const response = await axios.post<MarkdownExtractionResult>(
      `${this.baseUrl}/api/document/to-markdown`,
      formData,
      {
        headers: {
          ...formData.getHeaders(),
        },
        timeout: 180000, // 3-minute timeout (large Excel file conversion)
      }
    );

    return response.data;
  } catch (error) {
    console.error('[ExtractionClient] Convert to markdown failed:', error);

    if (axios.isAxiosError(error) && error.response) {
      // The error body is untyped: it may be null, a plain string, or an
      // object. Narrow it before reading `detail` so a malformed body cannot
      // throw a TypeError here and mask the real failure.
      const body: unknown = error.response.data;
      const detail: unknown =
        typeof body === 'object' && body !== null
          ? (body as { detail?: unknown }).detail
          : undefined;

      let reason: string;
      if (!detail) {
        // Same fallback as before for a missing/empty detail.
        reason = error.message;
      } else if (typeof detail === 'string') {
        reason = detail;
      } else {
        // A structured detail (e.g. a list of validation errors) would be
        // rendered as "[object Object]" by string interpolation.
        // NOTE(review): assumes the service may return a non-string `detail` — confirm.
        reason = JSON.stringify(detail);
      }

      throw new Error(`Document markdown conversion failed: ${reason}`);
    }

    throw new Error('Document markdown conversion failed');
  }
}
|
||||
|
||||
/**
|
||||
* 🆕 数据侦探 API - 分析 Word 文档
|
||||
* 提取表格并进行数据验证(L1 算术 + L2 统计 + L2.5 一致性)
|
||||
|
||||
@@ -79,10 +79,15 @@ export async function uploadAttachment(
|
||||
stack: error instanceof Error ? error.stack : undefined,
|
||||
});
|
||||
|
||||
return reply.status(500).send({
|
||||
const isValidationError =
|
||||
errorMessage.includes('不支持') ||
|
||||
errorMessage.includes('文件类型') ||
|
||||
errorMessage.includes('请在 Word 中另存为 .docx');
|
||||
|
||||
return reply.status(isValidationError ? 400 : 500).send({
|
||||
code: -1,
|
||||
error: {
|
||||
code: 'INTERNAL_ERROR',
|
||||
code: isValidationError ? 'VALIDATION_ERROR' : 'INTERNAL_ERROR',
|
||||
message: errorMessage,
|
||||
},
|
||||
});
|
||||
|
||||
@@ -24,7 +24,7 @@ const ATTACHMENT_CACHE_TTL = 2 * 60 * 60; // 2小时
|
||||
|
||||
const MAX_ATTACHMENTS = 5;
|
||||
const MAX_TOKENS_PER_ATTACHMENT = 30000; // 单个附件最大 30k Token
|
||||
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'doc'];
|
||||
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'csv'];
|
||||
|
||||
interface AttachmentQueryScope {
|
||||
userId?: string;
|
||||
@@ -33,6 +33,7 @@ interface AttachmentQueryScope {
|
||||
|
||||
interface AiaAttachmentTextRecord {
|
||||
id: string;
|
||||
filename: string;
|
||||
textContent: string | null;
|
||||
extractStatus: 'success' | 'failed' | 'empty' | string;
|
||||
extractError: string | null;
|
||||
@@ -61,7 +62,13 @@ export async function uploadAttachment(
|
||||
// 1. 验证文件类型
|
||||
const ext = file.filename.split('.').pop()?.toLowerCase();
|
||||
if (!ext || !ALLOWED_FILE_TYPES.includes(ext)) {
|
||||
throw new Error(`不支持的文件类型: ${ext}。支持: ${ALLOWED_FILE_TYPES.join(', ')}`);
|
||||
if (ext === 'doc') {
|
||||
throw new Error('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
|
||||
}
|
||||
if (ext === 'xls') {
|
||||
throw new Error('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
|
||||
}
|
||||
throw new Error(`不支持的文件类型: .${ext || 'unknown'}。当前支持:.pdf、.docx、.txt、.xlsx、.csv`);
|
||||
}
|
||||
|
||||
// 2. 上传到存储服务
|
||||
@@ -97,8 +104,20 @@ export async function uploadAttachment(
|
||||
let result;
|
||||
if (ext === 'pdf') {
|
||||
result = await extractionClient.extractPdf(file.buffer, file.filename);
|
||||
} else if (ext === 'docx' || ext === 'doc') {
|
||||
} else if (ext === 'docx') {
|
||||
result = await extractionClient.extractDocx(file.buffer, file.filename);
|
||||
} else if (ext === 'xlsx' || ext === 'csv') {
|
||||
const markdownResult = await extractionClient.convertToMarkdown(file.buffer, file.filename);
|
||||
result = {
|
||||
success: markdownResult.success,
|
||||
method: ext === 'csv' ? 'markdown-csv' : 'markdown-excel',
|
||||
text: markdownResult.text || '',
|
||||
metadata: {
|
||||
filename: file.filename,
|
||||
...(markdownResult.metadata || {}),
|
||||
},
|
||||
error: markdownResult.error,
|
||||
};
|
||||
} else {
|
||||
result = await extractionClient.extractDocument(file.buffer, file.filename);
|
||||
}
|
||||
@@ -253,6 +272,7 @@ export async function getAttachmentsText(
|
||||
where,
|
||||
select: {
|
||||
id: true,
|
||||
filename: true,
|
||||
textContent: true,
|
||||
extractStatus: true,
|
||||
extractError: true,
|
||||
@@ -265,29 +285,30 @@ export async function getAttachmentsText(
|
||||
try {
|
||||
const cacheKey = `${ATTACHMENT_CACHE_PREFIX}${attachmentId}`;
|
||||
const text = await cache.get<string>(cacheKey);
|
||||
const record = recordMap.get(attachmentId);
|
||||
const displayName = record?.filename || attachmentId;
|
||||
|
||||
if (text) {
|
||||
texts.push(`【附件: ${attachmentId}】\n${text}`);
|
||||
texts.push(`【附件: ${displayName}】\n${text}`);
|
||||
logger.debug('[AIA:AttachmentService] 从缓存获取附件文本成功', {
|
||||
attachmentId,
|
||||
textLength: text.length,
|
||||
});
|
||||
} else {
|
||||
const record = recordMap.get(attachmentId);
|
||||
logger.warn('[AIA:AttachmentService] 附件文本缓存未命中,尝试数据库回源', {
|
||||
attachmentId,
|
||||
hasDbRecord: !!record,
|
||||
});
|
||||
|
||||
if (record?.extractStatus === 'success' && record.textContent) {
|
||||
texts.push(`【附件: ${attachmentId}】\n${record.textContent}`);
|
||||
texts.push(`【附件: ${displayName}】\n${record.textContent}`);
|
||||
await cache.set(cacheKey, record.textContent, ATTACHMENT_CACHE_TTL);
|
||||
} else if (record?.extractStatus === 'failed') {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
|
||||
} else if (record?.extractStatus === 'empty') {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容为空或无法提取有效文本]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容为空或无法提取有效文本]`);
|
||||
} else {
|
||||
texts.push(`【附件: ${attachmentId}】\n[附件内容不存在或未就绪]`);
|
||||
texts.push(`【附件: ${displayName}】\n[附件内容不存在或未就绪]`);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
@@ -31,6 +31,11 @@ import type {
|
||||
const DEFAULT_MODEL = 'deepseek-v3';
|
||||
const MAX_CONTEXT_MESSAGES = 20;
|
||||
const MAX_CONTEXT_TOKENS = 8000;
|
||||
// Guard prompt for attachment Q&A. It is pushed into the context as a
// `system` message when attachment text is present for the current turn.
// It instructs the model to answer only from the supplied attachment content,
// never to cite or invent a system knowledge base / preset documents, and to
// ask the user to re-upload a parseable file (.pdf/.docx/.txt/.xlsx/.csv)
// when extraction failed, the content is empty, or it is unrelated to the question.
const ATTACHMENT_QA_GUARD_PROMPT = `当本轮对话包含“附件内容”时,你必须严格遵守:
|
||||
1) 仅基于本轮提供的“附件内容”和用户问题作答。
|
||||
2) 严禁引用或假设任何“系统知识库/预置知识文档/隐藏资料”内容。
|
||||
3) 若附件内容提取失败、为空或与问题无关,必须明确告知用户并请其重新上传可解析文件(建议 .pdf/.docx/.txt/.xlsx/.csv)。
|
||||
4) 禁止编造你“已阅读某知识库文档”或“基于后台知识库整理”。`;
|
||||
|
||||
// ==================== 对话管理 ====================
|
||||
|
||||
@@ -313,6 +318,11 @@ export async function sendMessageStream(
|
||||
if (attachmentIds && attachmentIds.length > 0) {
|
||||
const attachmentText = await getAttachmentText(attachmentIds, userId, conversationId);
|
||||
if (attachmentText) {
|
||||
// 附件问答加严护栏:禁止模型引用系统知识库或虚构背景文档
|
||||
contextMessages.push({
|
||||
role: 'system',
|
||||
content: ATTACHMENT_QA_GUARD_PROMPT,
|
||||
});
|
||||
userContent = `${content}\n\n---\n附件内容:\n${attachmentText}`;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,8 +168,10 @@ export const ATTACHMENT_CONFIG = {
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'text/plain',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'text/csv',
|
||||
'application/csv',
|
||||
],
|
||||
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx'],
|
||||
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx', 'csv'],
|
||||
};
|
||||
|
||||
// ==================== API 响应格式 ====================
|
||||
|
||||
Reference in New Issue
Block a user