feat(rvw): implement Skills architecture (Day 7-10)

- Add Skills core framework (types, registry, executor, profile, context)
- Implement DataForensicsSkill with DI, path security, graceful degradation
- Implement EditorialSkill and MethodologySkill wrapping existing services
- Extend ExtractionClient with IExtractionClient interface and analyzeDocx
- Refactor reviewWorker to support V1/V2 architecture switching
- Add Zod config validation and generic type support
- Update development docs and module status

Day 7: Skills core framework (~700 lines)
Day 8: DataForensicsSkill + ExtractionClient extension (~400 lines)
Day 9: EditorialSkill + MethodologySkill (~350 lines)
Day 10: ReviewWorker integration (~280 lines)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-18 10:09:40 +08:00
parent e785969e54
commit 9f256c4a02
20 changed files with 5603 additions and 72 deletions
--- a/backend/src/common/document/ExtractionClient.ts
+++ b/backend/src/common/document/ExtractionClient.ts
@@ -1,9 +1,13 @@
 import FormData from 'form-data';
 import axios from 'axios';
+import * as fs from 'fs';
+import * as path from 'path';

 /**
 * Extraction Service Client
 * 调用Python微服务进行文档提取
+ * 
+ * @version 2.0.0 - 新增数据侦探 API (analyzeDocx)
 */

 const EXTRACTION_SERVICE_URL = process.env.EXTRACTION_SERVICE_URL || 'http://localhost:8000';
@@ -22,12 +26,78 @@ export interface ExtractionResult {
    file_size?: number;
    page_count?: number;
    has_tables?: boolean;
-    [key: string]: any;
+    [key: string]: unknown;
  };
  error?: string;
 }

-class ExtractionClient {
+/**
+ * Data-forensics result (returned by the Python service).
+ */
+export interface ForensicsResult {
+  tables: ForensicsTable[];
+  methods: string[]; // NOTE(review): meaning not visible in this file; confirm against the service
+  issues: ForensicsIssue[];
+  summary: { // NOTE(review): presumably aggregate counts over `tables` and `issues`; confirm
+    totalTables: number;
+    totalIssues: number;
+    errorCount: number;
+    warningCount: number;
+  };
+}
+
+export interface ForensicsTable { // element type of ForensicsResult.tables
+  id: string; // NOTE(review): presumably the value ForensicsIssue.location.tableId refers to; confirm
+  caption: string;
+  data: string[][]; // cell values as strings; NOTE(review): presumably row-major; confirm
+  html?: string;
+  headers?: string[];
+  rowCount: number;
+  colCount: number;
+}
+
+export interface ForensicsIssue { // element type of ForensicsResult.issues
+  severity: 'ERROR' | 'WARNING' | 'INFO';
+  type: string;
+  message: string;
+  location?: { // optional, and every locator inside it is optional too
+    tableId?: string;
+    cellRef?: string;
+    paragraph?: number;
+    lineRange?: [number, number];
+  };
+  evidence?: { // optional; the named keys plus arbitrary extra keys via the index signature
+    expected?: string | number;
+    actual?: string | number;
+    formula?: string;
+    [key: string]: unknown;
+  };
+}
+
+/**
+ * Data-forensics configuration.
+ */
+export interface ForensicsConfig {
+  checkLevel: 'L1' | 'L1_L2' | 'L1_L2_L25'; // sent by analyzeDocx as the `check_level` form field
+  tolerancePercent: number; // sent by analyzeDocx, stringified, as the `tolerance_percent` form field
+}
+
+/**
+ * IExtractionClient interface.
+ * Used for dependency injection, so tests can substitute a mock.
+ */
+export interface IExtractionClient {
+  health(): Promise<{ status: string; checks: unknown; timestamp: string }>;
+  extractDocument(file: Buffer, filename: string): Promise<ExtractionResult>;
+  extractPdf(file: Buffer, filename: string, method?: 'auto' | 'nougat' | 'pymupdf'): Promise<ExtractionResult>;
+  extractDocx(file: Buffer, filename: string): Promise<ExtractionResult>;
+  extractTxt(file: Buffer, filename: string): Promise<ExtractionResult>;
+  detectLanguage(file: Buffer, filename: string): Promise<{ language: string; chinese_ratio: number; chinese_chars: number; total_chars: number }>;
+  getPdfStrategy(file: Buffer, filename: string): Promise<{ detected_language: string; recommended_method: string; reason: string; nougat_available: boolean }>;
+  analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult>; // takes a file path, unlike the Buffer-based methods above
+}
+
+class ExtractionClient implements IExtractionClient {
  private baseUrl: string;

  constructor(baseUrl: string = EXTRACTION_SERVICE_URL) {
@@ -260,6 +330,63 @@ class ExtractionClient {
      throw new Error('Get PDF strategy failed');
    }
  }
+
+  /**
+   * Data-forensics API: analyze a Word document.
+   * Extracts tables and validates the data (L1 arithmetic + L2 statistics + L2.5 consistency).
+   *
+   * @param filePath Path of the file on this server; it is read from disk and uploaded.
+   * @param config Forensics options, sent as the `check_level` / `tolerance_percent` form fields.
+   * @returns The response body of the forensics endpoint.
+   * @throws Error with `code` 'ECONNREFUSED' (service unreachable) or 'ETIMEDOUT' (request
+   *   timed out); a plain Error for any other failure, including failing to read `filePath`.
+   */
+  async analyzeDocx(filePath: string, config: ForensicsConfig): Promise<ForensicsResult> {
+    try {
+      // Read the file and upload it under its base name.
+      const file = fs.readFileSync(filePath);
+      const filename = path.basename(filePath);
+
+      const formData = new FormData();
+      formData.append('file', file, filename);
+      formData.append('check_level', config.checkLevel);
+      formData.append('tolerance_percent', config.tolerancePercent.toString());
+
+      const response = await axios.post<ForensicsResult>(
+        `${this.baseUrl}/api/v1/forensics/analyze`,
+        formData,
+        {
+          headers: { ...formData.getHeaders() },
+          timeout: 60000, // 60-second timeout
+        }
+      );
+
+      return response.data;
+    } catch (error) {
+      console.error('[ExtractionClient] Forensics analysis failed:', error);
+
+      if (axios.isAxiosError(error)) {
+        if (error.code === 'ECONNREFUSED') {
+          const err = new Error('Forensics service unavailable');
+          (err as NodeJS.ErrnoException).code = 'ECONNREFUSED';
+          throw err;
+        }
+        // axios reports an expired `timeout` option as ECONNABORTED by default, so both
+        // codes are treated as a timeout; callers still see the ETIMEDOUT code.
+        if (error.code === 'ETIMEDOUT' || error.code === 'ECONNABORTED') {
+          const err = new Error('Forensics service timeout');
+          (err as NodeJS.ErrnoException).code = 'ETIMEDOUT';
+          throw err;
+        }
+        if (error.response) {
+          // The body may be empty; `?.` keeps a TypeError from masking the HTTP failure.
+          throw new Error(`Forensics analysis failed: ${error.response.data?.detail || error.message}`);
+        }
+      }
+
+      throw new Error('Forensics analysis failed');
+    }
+  }
 }

 // 导出类和单例