feat(asl): Complete Deep Research V2.0 core development

Backend:
- Add SSE streaming client (unifuncsSseClient) replacing async polling
- Add paragraph-based reasoning parser with mergeConsecutiveThinking
- Add requirement expansion service (DeepSeek-V3 PICOS+MeSH)
- Add Word export service with Pandoc, inline hyperlinks, and reference link expansion
- Add deep research V2 worker with 2s log flush and Chinese source prompt
- Add config for 5 curated data sources (PubMed/ClinicalTrials/Cochrane/CNKI/MedJournals)
- Add 4 API endpoints (generate-requirement/tasks/task-status/export-word)
- Update Prisma schema with 6 new V2.0 fields on AslResearchTask
- Add DB migration for V2.0 fields
- Simplify ASL_DEEP_RESEARCH_EXPANSION prompt (remove strategy section)

Frontend:
- Add waterfall-flow DeepResearchPage (phase 0-4 progressive reveal)
- Add LandingView, SetupPanel, StrategyConfirm, AgentTerminal, ResultsView
- Add react-markdown + remark-gfm for report rendering
- Add custom link component showing visible URLs after references
- Add useDeepResearchTask polling hook
- Add deep research TypeScript types

Tests:
- Add E2E test, smoke test, and Chinese data source test scripts

Docs:
- Update ASL module status (v2.0 - core features complete)
- Update system status (v6.1 - ASL V2.0 milestone)
- Update Unifuncs DeepSearch API guide (v2.0 - SSE mode + Chinese source results)
- Update module auth specification (test script guidelines)
- Update V2.0 development plan

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-23 13:21:52 +08:00
parent b06daecacd
commit 8f06d4f929
39 changed files with 5605 additions and 417 deletions


@@ -0,0 +1,115 @@
/**
 * Reasoning Content Parser
 *
 * Parses the incremental reasoning_content text returned by Unifuncs into
 * structured log entries.
 *
 * Core strategy: split on paragraph breaks (\n\n); thinking lines inside the
 * same paragraph are merged into a single log entry, while action lines
 * (search / read / analyze) each become a standalone entry.
 */

export interface ExecutionLogEntry {
  type: 'thinking' | 'searching' | 'reading' | 'analyzing' | 'summary' | 'info';
  title: string;
  text: string;
  ts: string;
}

// Action-detection patterns. Each carries both Chinese and English
// alternatives so actions are recognised in either language.
const SEARCH_PATTERN = /(?:搜索|searching|search(?:ing)?\s+for|查找|检索|looking\s+for)[:\s]+(.+)/i;
const READ_PATTERN = /(?:阅读|reading|read(?:ing)?|访问|打开|visiting|open(?:ing)?)\s*[:\s]*(https?:\/\/\S+|\S+\.(?:com|org|net|gov|cn)\S*)/i;
const ANALYZE_PATTERN = /(?:分析|analyz|发现|总结|归纳|结论|found|result|finding|conclud|summariz)/i;

/**
 * Parse an increment of reasoning text into paragraph-level log entries.
 * Consecutive thinking lines are merged into one entry; action lines
 * (search / read / analyze) each become a standalone entry.
 */
export function parseReasoningIncrement(
  newText: string,
  _previousLength: number
): ExecutionLogEntry[] {
  if (!newText) return [];

  const entries: ExecutionLogEntry[] = [];
  const now = new Date().toISOString();
  const paragraphs = newText.split(/\n{2,}/);

  for (const para of paragraphs) {
    const lines = para.split('\n').filter(l => l.trim());
    if (lines.length === 0) continue;

    let thinkingBuf: string[] = [];
    const flushThinking = () => {
      if (thinkingBuf.length === 0) return;
      const text = thinkingBuf.join('').slice(0, 800);
      if (text.length > 10) {
        entries.push({ type: 'thinking', title: '思考', text, ts: now });
      }
      thinkingBuf = [];
    };

    for (const line of lines) {
      const trimmed = line.trim();
      if (!trimmed) continue;

      const searchMatch = trimmed.match(SEARCH_PATTERN);
      if (searchMatch) {
        flushThinking();
        entries.push({ type: 'searching', title: '搜索', text: searchMatch[1].trim(), ts: now });
        continue;
      }

      const readMatch = trimmed.match(READ_PATTERN);
      if (readMatch) {
        flushThinking();
        entries.push({ type: 'reading', title: '阅读页面', text: readMatch[1].trim(), ts: now });
        continue;
      }

      if (ANALYZE_PATTERN.test(trimmed) && trimmed.length > 20) {
        flushThinking();
        entries.push({ type: 'analyzing', title: '分析', text: trimmed.slice(0, 500), ts: now });
        continue;
      }

      thinkingBuf.push(trimmed);
    }

    flushThinking();
  }

  return entries;
}

/**
 * Merge consecutive 'thinking' entries into a single entry.
 * Called by the worker before writing to the DB, to reduce fragmentation.
 */
export function mergeConsecutiveThinking(entries: ExecutionLogEntry[]): ExecutionLogEntry[] {
  if (entries.length <= 1) return entries;

  const merged: ExecutionLogEntry[] = [];
  let current = { ...entries[0] };

  for (let i = 1; i < entries.length; i++) {
    if (entries[i].type === 'thinking' && current.type === 'thinking') {
      current.text = (current.text + ' ' + entries[i].text).slice(0, 800);
    } else {
      merged.push(current);
      current = { ...entries[i] };
    }
  }
  merged.push(current);

  return merged;
}

/**
 * Extract summary-level log entries from the full reasoning_content in one pass.
 */
export function parseFullReasoning(fullText: string): ExecutionLogEntry[] {
  if (!fullText) return [];
  return parseReasoningIncrement(fullText, 0);
}
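
A minimal usage sketch (editor's illustration, not part of the diff; the relative import path and the sample reasoning text are assumptions):

import { parseReasoningIncrement, mergeConsecutiveThinking } from './reasoningParser.js'; // path assumed

// A hypothetical incremental chunk: one thinking line, then two action lines.
const chunk = [
  '先梳理 ASL 灌注成像在临床中的应用背景与研究现状',
  '搜索: ASL perfusion MRI clinical applications',
  '',
  '阅读: https://pubmed.ncbi.nlm.nih.gov/12345678/',
].join('\n');

const entries = mergeConsecutiveThinking(parseReasoningIncrement(chunk, 0));
// => [ { type: 'thinking', ... },
//      { type: 'searching', text: 'ASL perfusion MRI clinical applications', ... },
//      { type: 'reading', text: 'https://pubmed.ncbi.nlm.nih.gov/12345678/', ... } ]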


@@ -0,0 +1,113 @@
/**
 * Deep Research V2.0 — Result Parser
 *
 * Responsibilities:
 * 1. Split the Unifuncs content into synthesisReport + resultList.
 * 2. safeParseJsonList: four-layer fault-tolerant JSON parsing.
 */
import { logger } from '../../../common/logging/index.js';

export interface LiteratureItem {
  title: string;
  authors?: string;
  journal?: string;
  year?: number | string;
  doi?: string;
  pmid?: string;
  url?: string;
  abstract?: string;
  studyType?: string;
}

/**
 * Split the synthesis report and the literature list out of the content
 * returned by Unifuncs.
 */
export function parseContent(content: string): {
  synthesisReport: string;
  resultList: LiteratureItem[] | null;
} {
  if (!content) {
    return { synthesisReport: '', resultList: null };
  }

  // Preferred path: a fenced ```json block carries the literature list;
  // everything around it is the synthesis report.
  const jsonBlockMatch = content.match(/```json\s*([\s\S]*?)```/);
  if (jsonBlockMatch) {
    const beforeJson = content.slice(0, content.indexOf('```json')).trim();
    const jsonRaw = jsonBlockMatch[1];
    const resultList = safeParseJsonList(jsonRaw);
    const afterJsonEnd = content.indexOf('```', content.indexOf('```json') + 7) + 3;
    const afterJson = content.slice(afterJsonEnd).trim();
    const synthesisReport = (beforeJson + (afterJson ? '\n\n' + afterJson : '')).trim();
    return { synthesisReport: synthesisReport || content, resultList };
  }

  // Fallback: no JSON block — recover bare PubMed links as a minimal list.
  const links = extractPubMedLinks(content);
  if (links.length > 0) {
    const resultList: LiteratureItem[] = links.map(url => ({
      title: '',
      url,
      pmid: extractPmidFromUrl(url) || undefined,
    }));
    return { synthesisReport: content, resultList };
  }

  return { synthesisReport: content, resultList: null };
}

/**
 * Four-layer fault-tolerant JSON parsing: strip code fences, drop trailing
 * commas, JSON.parse, then per-object regex recovery.
 */
export function safeParseJsonList(raw: string | null): LiteratureItem[] | null {
  if (!raw) return null;

  // Layer 1: strip any markdown code fences.
  let cleaned = raw.replace(/```json\s*/gi, '').replace(/```\s*/g, '');
  // Layer 2: remove trailing commas before } or ].
  cleaned = cleaned.replace(/,\s*([}\]])/g, '$1');

  // Layer 3: standard JSON.parse.
  try {
    const parsed = JSON.parse(cleaned);
    return Array.isArray(parsed) ? parsed : [parsed];
  } catch {
    logger.warn('[resultParser] Standard JSON.parse failed, trying regex extraction');
  }

  // Layer 4: extract flat (non-nested) {...} objects one at a time.
  const objects: any[] = [];
  const regex = /\{[^{}]*\}/g;
  let match;
  while ((match = regex.exec(cleaned)) !== null) {
    try {
      objects.push(JSON.parse(match[0]));
    } catch {
      // skip unparseable fragment
    }
  }
  if (objects.length > 0) {
    logger.info('[resultParser] Regex extraction recovered items', { count: objects.length });
    return objects;
  }

  logger.warn('[resultParser] All parsing strategies failed');
  return null;
}

function extractPubMedLinks(content: string): string[] {
  const linkSet = new Set<string>();
  const pattern = /https?:\/\/pubmed\.ncbi\.nlm\.nih\.gov\/(\d+)\/?/gi;
  let match;
  while ((match = pattern.exec(content)) !== null) {
    linkSet.add(`https://pubmed.ncbi.nlm.nih.gov/${match[1]}/`);
  }
  return Array.from(linkSet);
}

function extractPmidFromUrl(url: string): string | null {
  const m = url.match(/pubmed\.ncbi\.nlm\.nih\.gov\/(\d+)/);
  return m ? m[1] : null;
}
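
A minimal usage sketch (editor's illustration, not part of the diff; the import path and sample payload are assumptions). It exercises the fenced-JSON path, including the trailing-comma repair in layer 2:

import { parseContent } from './resultParser.js'; // path assumed

// Hypothetical Unifuncs content: report text, then a fenced JSON list
// with a trailing comma that layer 2 of safeParseJsonList removes.
const content = [
  '## 综合报告',
  'Evidence summary for ASL perfusion imaging ...',
  '',
  '```json',
  '[{"title": "ASL perfusion in acute stroke", "pmid": "12345678", "year": 2024,}]',
  '```',
].join('\n');

const { synthesisReport, resultList } = parseContent(content);
// synthesisReport => the report text with the JSON block removed
// resultList      => [{ title: 'ASL perfusion in acute stroke', pmid: '12345678', year: 2024 }]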