fix(aia): stabilize attachment parsing and expand spreadsheet support

Align AIA attachment handling with actual extractor capability by adding gray-release (limited-rollout) support for xlsx/csv, guiding doc/xls users to convert their files to docx/xlsx, and enforcing attachment-only answering to prevent system knowledge leakage.

Made-with: Cursor
This commit is contained in:
2026-03-10 13:15:36 +08:00
parent 097e7920ab
commit d96cdf3fe8
8 changed files with 118 additions and 19 deletions

View File

@@ -31,6 +31,19 @@ export interface ExtractionResult {
error?: string;
}
/**
 * Response shape of a document-to-Markdown conversion.
 *
 * Only `success` is mandatory; every other field may be absent.
 */
export interface MarkdownExtractionResult {
  /** Outcome flag of the conversion. */
  success: boolean;
  /** Text produced by the conversion, when present. */
  text?: string;
  /** Label of the output format, when present. */
  format?: string;
  /**
   * Optional descriptive fields. Besides the three named keys, any
   * additional string key is accepted with an `unknown` value.
   */
  metadata?: {
    original_file_type?: string;
    char_count?: number;
    filename?: string;
    [extraKey: string]: unknown;
  };
  /** Error description, when present. */
  error?: string;
}
/**
* 数据侦探结果(Python 返回)
*/
@@ -95,6 +108,7 @@ export interface IExtractionClient {
extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;
detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>;
getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>;
convertToMarkdown(file: Buffer, filename: string): Promise<MarkdownExtractionResult>;
analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>;
}
@@ -332,6 +346,40 @@ class ExtractionClient implements IExtractionClient {
}
}
/**
 * Generic document-to-Markdown conversion (used for gray-release formats
 * such as .xlsx).
 *
 * Sends the file as multipart form data to `/api/document/to-markdown` on the
 * configured base URL and returns the response body as-is.
 *
 * @param file - Raw file contents.
 * @param filename - File name attached to the multipart `file` field.
 * @returns The response body of the conversion endpoint.
 * @throws Error whose message starts with "Document markdown conversion failed".
 *   When the server answered, the response's `detail` (or, if absent, the
 *   axios error message) is appended.
 */
async convertToMarkdown(
  file: Buffer,
  filename: string
): Promise<MarkdownExtractionResult> {
  try {
    const formData = new FormData();
    formData.append('file', file, filename);

    const response = await axios.post<MarkdownExtractionResult>(
      `${this.baseUrl}/api/document/to-markdown`,
      formData,
      {
        headers: {
          ...formData.getHeaders(),
        },
        timeout: 180000, // 3-minute timeout (large Excel file conversion)
      }
    );

    return response.data;
  } catch (error) {
    console.error('[ExtractionClient] Convert to markdown failed:', error);

    if (axios.isAxiosError(error) && error.response) {
      // The error body may be null/undefined (empty response), so guard the
      // property access instead of throwing a TypeError from inside the catch.
      const detail: unknown = error.response.data?.detail;

      let reason: string;
      if (!detail) {
        // Missing or empty detail: fall back to the axios message.
        reason = error.message;
      } else if (typeof detail === 'string') {
        reason = detail;
      } else {
        // Structured detail (object/array) would otherwise render as
        // "[object Object]". NOTE(review): presumably a FastAPI-style
        // validation payload — confirm against the extraction service.
        reason = JSON.stringify(detail);
      }

      throw new Error(`Document markdown conversion failed: ${reason}`);
    }

    throw new Error('Document markdown conversion failed');
  }
}
/**
* 🆕 数据侦探 API - 分析 Word 文档
* 提取表格并进行数据验证(L1 算术 + L2 统计 + L2.5 一致性)

View File

@@ -79,10 +79,15 @@ export async function uploadAttachment(
stack: error instanceof Error ? error.stack : undefined,
});
return reply.status(500).send({
const isValidationError =
errorMessage.includes('不支持') ||
errorMessage.includes('文件类型') ||
errorMessage.includes('请在 Word 中另存为 .docx');
return reply.status(isValidationError ? 400 : 500).send({
code: -1,
error: {
code: 'INTERNAL_ERROR',
code: isValidationError ? 'VALIDATION_ERROR' : 'INTERNAL_ERROR',
message: errorMessage,
},
});

View File

@@ -24,7 +24,7 @@ const ATTACHMENT_CACHE_TTL = 2 * 60 * 60; // 2小时
const MAX_ATTACHMENTS = 5;
const MAX_TOKENS_PER_ATTACHMENT = 30000; // 单个附件最大 30k Token
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'doc'];
const ALLOWED_FILE_TYPES = ['pdf', 'docx', 'txt', 'xlsx', 'csv'];
interface AttachmentQueryScope {
userId?: string;
@@ -33,6 +33,7 @@ interface AttachmentQueryScope {
interface AiaAttachmentTextRecord {
id: string;
filename: string;
textContent: string | null;
extractStatus: 'success' | 'failed' | 'empty' | string;
extractError: string | null;
@@ -61,7 +62,13 @@ export async function uploadAttachment(
// 1. 验证文件类型
const ext = file.filename.split('.').pop()?.toLowerCase();
if (!ext || !ALLOWED_FILE_TYPES.includes(ext)) {
throw new Error(`不支持的文件类型: ${ext}。支持: ${ALLOWED_FILE_TYPES.join(', ')}`);
if (ext === 'doc') {
throw new Error('系统为了保证文档解析的精确度,当前仅支持 .docx 格式。请您在本地 Word 中打开该文件,选择“另存为 -> Word 文档 (.docx)”后再次上传。');
}
if (ext === 'xls') {
throw new Error('系统为了保证表格解析的稳定性,当前仅支持 .xlsx 格式。请您在本地 Excel/WPS 中打开该文件,选择“另存为 -> Excel 工作簿 (.xlsx)”后再次上传。');
}
throw new Error(`不支持的文件类型: .${ext || 'unknown'}。当前支持:.pdf、.docx、.txt、.xlsx、.csv`);
}
// 2. 上传到存储服务
@@ -97,8 +104,20 @@ export async function uploadAttachment(
let result;
if (ext === 'pdf') {
result = await extractionClient.extractPdf(file.buffer, file.filename);
} else if (ext === 'docx' || ext === 'doc') {
} else if (ext === 'docx') {
result = await extractionClient.extractDocx(file.buffer, file.filename);
} else if (ext === 'xlsx' || ext === 'csv') {
const markdownResult = await extractionClient.convertToMarkdown(file.buffer, file.filename);
result = {
success: markdownResult.success,
method: ext === 'csv' ? 'markdown-csv' : 'markdown-excel',
text: markdownResult.text || '',
metadata: {
filename: file.filename,
...(markdownResult.metadata || {}),
},
error: markdownResult.error,
};
} else {
result = await extractionClient.extractDocument(file.buffer, file.filename);
}
@@ -253,6 +272,7 @@ export async function getAttachmentsText(
where,
select: {
id: true,
filename: true,
textContent: true,
extractStatus: true,
extractError: true,
@@ -265,29 +285,30 @@ export async function getAttachmentsText(
try {
const cacheKey = `${ATTACHMENT_CACHE_PREFIX}${attachmentId}`;
const text = await cache.get<string>(cacheKey);
const record = recordMap.get(attachmentId);
const displayName = record?.filename || attachmentId;
if (text) {
texts.push(`【附件: ${attachmentId}\n${text}`);
texts.push(`【附件: ${displayName}\n${text}`);
logger.debug('[AIA:AttachmentService] 从缓存获取附件文本成功', {
attachmentId,
textLength: text.length,
});
} else {
const record = recordMap.get(attachmentId);
logger.warn('[AIA:AttachmentService] 附件文本缓存未命中,尝试数据库回源', {
attachmentId,
hasDbRecord: !!record,
});
if (record?.extractStatus === 'success' && record.textContent) {
texts.push(`【附件: ${attachmentId}\n${record.textContent}`);
texts.push(`【附件: ${displayName}\n${record.textContent}`);
await cache.set(cacheKey, record.textContent, ATTACHMENT_CACHE_TTL);
} else if (record?.extractStatus === 'failed') {
texts.push(`【附件: ${attachmentId}\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
texts.push(`【附件: ${displayName}\n[附件内容提取失败:${record.extractError || '请重新上传附件'}]`);
} else if (record?.extractStatus === 'empty') {
texts.push(`【附件: ${attachmentId}\n[附件内容为空或无法提取有效文本]`);
texts.push(`【附件: ${displayName}\n[附件内容为空或无法提取有效文本]`);
} else {
texts.push(`【附件: ${attachmentId}\n[附件内容不存在或未就绪]`);
texts.push(`【附件: ${displayName}\n[附件内容不存在或未就绪]`);
}
}
} catch (error) {

View File

@@ -31,6 +31,11 @@ import type {
const DEFAULT_MODEL = 'deepseek-v3';
const MAX_CONTEXT_MESSAGES = 20;
const MAX_CONTEXT_TOKENS = 8000;
/**
 * Guard prompt for attachment question answering.
 *
 * The text instructs the model, whenever the turn contains attachment
 * content, to: (1) answer only from that attachment content and the user's
 * question, (2) never cite or assume any system knowledge base / preset
 * documents / hidden material, (3) tell the user and ask for a re-upload of a
 * parseable file when extraction failed, the content is empty, or it is
 * unrelated to the question, and (4) never claim to have read or summarised a
 * knowledge-base document.
 */
const ATTACHMENT_QA_GUARD_PROMPT = `当本轮对话包含“附件内容”时,你必须严格遵守:
1) 仅基于本轮提供的“附件内容”和用户问题作答。
2) 严禁引用或假设任何“系统知识库/预置知识文档/隐藏资料”内容。
3) 若附件内容提取失败、为空或与问题无关,必须明确告知用户并请其重新上传可解析文件(建议 .pdf/.docx/.txt/.xlsx/.csv)。
4) 禁止编造你“已阅读某知识库文档”或“基于后台知识库整理”。`;
// ==================== 对话管理 ====================
@@ -313,6 +318,11 @@ export async function sendMessageStream(
if (attachmentIds && attachmentIds.length > 0) {
const attachmentText = await getAttachmentText(attachmentIds, userId, conversationId);
if (attachmentText) {
// 附件问答加严护栏:禁止模型引用系统知识库或虚构背景文档
contextMessages.push({
role: 'system',
content: ATTACHMENT_QA_GUARD_PROMPT,
});
userContent = `${content}\n\n---\n附件内容\n${attachmentText}`;
}
}

View File

@@ -168,8 +168,10 @@ export const ATTACHMENT_CONFIG = {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'text/csv',
'application/csv',
],
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx'],
supportedExtensions: ['pdf', 'docx', 'txt', 'xlsx', 'csv'],
};
// ==================== API 响应格式 ====================