feat(rvw): implement Skills architecture (Day 7-10)
- Add Skills core framework (types, registry, executor, profile, context)
- Implement DataForensicsSkill with DI, path security, graceful degradation
- Implement EditorialSkill and MethodologySkill wrapping existing services
- Extend ExtractionClient with IExtractionClient interface and analyzeDocx
- Refactor reviewWorker to support V1/V2 architecture switching
- Add Zod config validation and generic type support
- Update development docs and module status

Day 7: Skills core framework (~700 lines)
Day 8: DataForensicsSkill + ExtractionClient extension (~400 lines)
Day 9: EditorialSkill + MethodologySkill (~350 lines)
Day 10: ReviewWorker integration (~280 lines)

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -1,9 +1,13 @@
|
||||
import FormData from 'form-data';
|
||||
import axios from 'axios';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
|
||||
* Extraction Service Client
|
||||
* 调用Python微服务进行文档提取
|
||||
*
|
||||
* @version 2.0.0 - 新增数据侦探 API (analyzeDocx)
|
||||
*/
|
||||
|
||||
const EXTRACTION_SERVICE_URL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';
|
||||
@@ -22,12 +26,78 @@ export interface ExtractionResult {
|
||||
file_size?: number;
|
||||
page_count?: number;
|
||||
has_tables?: boolean;
|
||||
[key: string]: any;
|
||||
[key: string]: unknown;
|
||||
};
|
||||
error?: string;
|
||||
}
|
||||
|
||||
class ExtractionClient {
|
||||
/**
 * Data-forensics result as returned by the Python extraction service.
 */
export interface ForensicsResult {
  /** Tables found in the analyzed document. */
  tables: ForensicsTable[];
  /** NOTE(review): presumably names of the statistical methods detected in the document — confirm against the Python service. */
  methods: string[];
  /** Findings reported by the forensics checks. */
  issues: ForensicsIssue[];
  /** Aggregate counters for this result. */
  summary: {
    /** Number of tables. */
    totalTables: number;
    /** Number of issues. */
    totalIssues: number;
    /** NOTE(review): presumably the number of issues with severity `ERROR` — confirm. */
    errorCount: number;
    /** NOTE(review): presumably the number of issues with severity `WARNING` — confirm. */
    warningCount: number;
  };
}
|
||||
|
||||
/**
 * A single table reported by the data-forensics analysis.
 */
export interface ForensicsTable {
  /** Table identifier. */
  id: string;
  /** Table caption text. */
  caption: string;
  /** Cell contents as a two-dimensional array of strings (assumed row-major — TODO confirm with the service). */
  data: string[][];
  /** Optional HTML rendering of the table. */
  html?: string;
  /** Optional header labels. */
  headers?: string[];
  /** Number of rows. */
  rowCount: number;
  /** Number of columns. */
  colCount: number;
}
|
||||
|
||||
/**
 * A single finding reported by the data-forensics analysis.
 */
export interface ForensicsIssue {
  /** Severity level of the finding. */
  severity: 'ERROR' | 'WARNING' | 'INFO';
  /** Issue type identifier (free-form string; the set of values is defined by the service). */
  type: string;
  /** Human-readable description of the issue. */
  message: string;
  /** Optional pointer to where the issue was found; every field is optional. */
  location?: {
    /** Identifier of the table concerned. */
    tableId?: string;
    /** Reference to the cell concerned (format defined by the service — TODO confirm). */
    cellRef?: string;
    /** Paragraph number (indexing base not visible here — TODO confirm). */
    paragraph?: number;
    /** Pair of line numbers delimiting the affected range. */
    lineRange?: [number, number];
  };
  /** Optional supporting data for the finding. */
  evidence?: {
    /** Expected value. */
    expected?: string | number;
    /** Value actually found. */
    actual?: string | number;
    /** Formula associated with the check. */
    formula?: string;
    /** Any additional evidence fields supplied by the service. */
    [key: string]: unknown;
  };
}
|
||||
|
||||
/**
 * Data-forensics configuration.
 */
export interface ForensicsConfig {
  /**
   * Which validation levels to run: L1 (arithmetic), L2 (statistics),
   * L2.5 (consistency). Each value enables the levels named in it.
   */
  checkLevel: 'L1' | 'L1_L2' | 'L1_L2_L25';
  /** Tolerance expressed as a percentage (exact semantics are defined by the service — TODO confirm). */
  tolerancePercent: number;
}
|
||||
|
||||
/**
 * Contract of the extraction service client.
 *
 * Exists so that consumers can receive the client through dependency
 * injection and tests can substitute a mock.
 */
export interface IExtractionClient {
  /** Health check of the extraction service. */
  health(): Promise<{ status: string; checks: unknown; timestamp: string }>;

  /** Extract a document from an in-memory file. */
  extractDocument(file: Buffer, filename: string): Promise<ExtractionResult>;

  /** Extract a PDF, optionally choosing the extraction method. */
  extractPdf(
    file: Buffer,
    filename: string,
    method?: 'auto' | 'nougat' | 'pymupdf'
  ): Promise<ExtractionResult>;

  /** Extract a Word (.docx) document. */
  extractDocx(file: Buffer, filename: string): Promise<ExtractionResult>;

  /** Extract a plain-text file. */
  extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;

  /** Detect the language of a file and report Chinese-character statistics. */
  detectLanguage(
    file: Buffer,
    filename: string
  ): Promise<{
    language: string;
    chinese_ratio: number;
    chinese_chars: number;
    total_chars: number;
  }>;

  /** Ask the service which PDF extraction method it recommends for a file. */
  getPdfStrategy(
    file: Buffer,
    filename: string
  ): Promise<{
    detected_language: string;
    recommended_method: string;
    reason: string;
    nougat_available: boolean;
  }>;

  /**
   * Run the data-forensics analysis on a Word document.
   * Unlike the other methods, this one takes a file path, not a Buffer.
   */
  analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>;
}
|
||||
|
||||
class ExtractionClient implements IExtractionClient {
|
||||
private baseUrl: string;
|
||||
|
||||
constructor(baseUrl: string = EXTRACTION_SERVICE_URL) {
|
||||
@@ -260,6 +330,63 @@ class ExtractionClient {
|
||||
throw new Error('Get PDF strategy failed');
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Data-forensics API: analyze a Word document.
 *
 * Reads the file at `filePath` from the local file system, uploads it as
 * multipart form data to `POST {baseUrl}/api/v1/forensics/analyze` together
 * with the check level and tolerance, and returns the response body.
 * The service extracts tables and validates the data
 * (L1 arithmetic + L2 statistics + L2.5 consistency).
 *
 * @param filePath Path of the file on this server's file system.
 * @param config Forensics configuration.
 * @returns The forensics result returned by the service.
 * @throws Error with `code === 'ECONNREFUSED'` when the connection is refused.
 * @throws Error with `code === 'ETIMEDOUT'` when the request times out.
 * @throws Error `Forensics analysis failed: <detail>` when the service answers
 *   with an error response, or `Forensics analysis failed` for any other
 *   failure (including failure to read the file).
 */
async analyzeDocx(
  filePath: string,
  config: ForensicsConfig
): Promise<ForensicsResult> {
  try {
    // Read asynchronously so a large upload does not block the event loop.
    const file = await fs.promises.readFile(filePath);
    const filename = path.basename(filePath);

    const formData = new FormData();
    formData.append('file', file, filename);
    formData.append('check_level', config.checkLevel);
    formData.append('tolerance_percent', config.tolerancePercent.toString());

    const response = await axios.post<ForensicsResult>(
      `${this.baseUrl}/api/v1/forensics/analyze`,
      formData,
      {
        // form-data supplies the multipart Content-Type including the boundary.
        headers: {
          ...formData.getHeaders(),
        },
        timeout: 60000, // 60-second timeout
      }
    );

    return response.data;
  } catch (error) {
    console.error('[ExtractionClient] Forensics analysis failed:', error);

    if (axios.isAxiosError(error)) {
      if (error.code === 'ECONNREFUSED') {
        const err = new Error('Forensics service unavailable');
        (err as NodeJS.ErrnoException).code = 'ECONNREFUSED';
        throw err;
      }

      // axios reports its own `timeout` expiry as ECONNABORTED by default
      // (ETIMEDOUT only with `transitional.clarifyTimeoutError`), so accept
      // both. ECONNABORTED is also used for non-timeout aborts, hence the
      // additional message check.
      const timedOut =
        error.code === 'ETIMEDOUT' ||
        (error.code === 'ECONNABORTED' && /timeout/i.test(error.message));
      if (timedOut) {
        const err = new Error('Forensics service timeout');
        (err as NodeJS.ErrnoException).code = 'ETIMEDOUT';
        throw err;
      }

      if (error.response) {
        // The error body may be empty or not an object; never dereference it blindly.
        const body: unknown = error.response.data;
        const detail: unknown =
          typeof body === 'object' && body !== null
            ? (body as { detail?: unknown }).detail
            : undefined;

        let reason: string;
        if (!detail) {
          reason = error.message;
        } else if (typeof detail === 'string') {
          reason = detail;
        } else {
          // Structured details would otherwise render as "[object Object]".
          reason = JSON.stringify(detail);
        }
        throw new Error(`Forensics analysis failed: ${reason}`);
      }
    }

    throw new Error('Forensics analysis failed');
  }
}
|
||||
}
|
||||
|
||||
// 导出类和单例
|
||||
|
||||
Reference in New Issue
Block a user