feat(aia): Protocol Agent MVP complete with one-click generation and Word export

- Add one-click research protocol generation with streaming output
- Implement Word document export via Pandoc integration
- Add dynamic dual-panel layout with resizable split pane
- Implement collapsible content for StatePanel stages
- Add conversation history management with title auto-update
- Fix scroll behavior, markdown rendering, and UI layout issues
- Simplify conversation creation logic for reliability
2026-01-25 19:16:36 +08:00
parent 4d7d97ca19
commit 303dd78c54
332 changed files with 6204 additions and 617 deletions
--- a/extraction_service/.dockerignore
+++ b/extraction_service/.dockerignore
@@ -90,5 +90,6 @@ models/



+


--- a/extraction_service/main.py
+++ b/extraction_service/main.py
@@ -9,7 +9,7 @@
 - 健康检查
 """

-from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi import FastAPI, File, UploadFile, HTTPException, Response
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
@@ -63,6 +63,8 @@ from services.dc_executor import validate_code, execute_pandas_code
 # 新增：统一文档处理器（RAG 引擎使用）
 from services.document_processor import DocumentProcessor, convert_to_markdown
 from services.pdf_markdown_processor import PdfMarkdownProcessor, extract_pdf_to_markdown
+# 新增：文档导出服务（Markdown → Word）
+from services.doc_export_service import check_pandoc_available, convert_markdown_to_docx, create_protocol_docx

 # 兼容：nougat 相关（已废弃，保留空实现避免报错）
 def check_nougat_available(): return False
@@ -243,6 +245,19 @@ class FillnaMiceRequest(BaseModel):
    random_state: int = 42


+class MarkdownToDocxRequest(BaseModel):
+    """Request model for converting Markdown text to a Word document."""
+    content: str  # Markdown source text
+    use_template: bool = True  # Whether to apply a Word template
+    # Document title.
+    # NOTE(review): the /api/convert/docx handler in this change does not appear to forward it — confirm
+    title: str = "临床研究方案"
+
+
+class ProtocolToDocxRequest(BaseModel):
+    """Request model for exporting a sectioned research protocol to Word."""
+    sections: Dict[str, str]  # Section key -> section body text
+    title: str = "临床研究方案"  # Document title
+
+
 # ==================== API路由 ====================

@app.get("/")
@@ -2106,6 +2121,160 @@ async def operation_fillna_mice(request: FillnaMiceRequest):
        }, status_code=400)


+# ==================== Word 导出 API ====================
+
+@app.get("/api/pandoc/status")
+async def pandoc_status():
+    """
+    Report whether Pandoc is usable for Word export.
+
+    Returns:
+        JSON body of the form
+        {
+            "available": bool,
+            "version": str,
+            "message": str
+        }
+        If the check itself raises, "available" is False, "version" is
+        null and "message" carries the error text.
+    """
+    try:
+        status = check_pandoc_available()
+        logger.info(f"Pandoc 状态检查: {status}")
+        return JSONResponse(content=status)
+    except Exception as exc:
+        reason = str(exc)
+        logger.error(f"Pandoc 状态检查失败: {reason}")
+        # Never surface the failure as an HTTP error; report "unavailable" instead.
+        fallback = {
+            "available": False,
+            "version": None,
+            "message": f"检查失败: {reason}"
+        }
+        return JSONResponse(content=fallback)
+
+
+@app.post("/api/convert/docx")
+async def convert_to_docx(request: MarkdownToDocxRequest):
+    """
+    Convert Markdown text into a Word (.docx) document.
+
+    Args:
+        request: MarkdownToDocxRequest
+            - content: Markdown source text
+            - use_template: whether to apply the Word template (default True)
+            - title: document title (accepted, but not forwarded to the
+              converter by this handler)
+
+    Returns:
+        Response whose body is the .docx bytes
+        (application/vnd.openxmlformats-officedocument.wordprocessingml.document),
+        sent as an attachment named "document.docx".
+
+    Raises:
+        HTTPException: status 500 when the converter reports failure or any
+            other error occurs while producing or reading the file.
+    """
+    try:
+        logger.info(f"开始转换 Markdown → Word, 内容长度: {len(request.content)} 字符")
+
+        # Run the conversion; no output path is given, so the converter
+        # writes to a temporary file and returns its path.
+        result = convert_markdown_to_docx(
+            markdown_text=request.content,
+            use_template=request.use_template
+        )
+
+        if not result["success"]:
+            logger.error(f"转换失败: {result.get('error', 'Unknown error')}")
+            raise HTTPException(
+                status_code=500,
+                detail=result.get("error", "转换失败")
+            )
+
+        # Read the generated file. Cleanup sits in `finally` so the temporary
+        # file is removed even when opening or reading it fails.
+        output_path = result["output_path"]
+        try:
+            with open(output_path, 'rb') as f:
+                content = f.read()
+        finally:
+            try:
+                os.remove(output_path)
+            except Exception as e:
+                # Best effort: a leftover temp file must not fail the request.
+                logger.warning(f"清理临时文件失败: {e}")
+
+        logger.info(f"Markdown → Word 转换成功, 文件大小: {len(content)} bytes")
+
+        # Return the document as a download
+        return Response(
+            content=content,
+            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            headers={
+                "Content-Disposition": 'attachment; filename="document.docx"'
+            }
+        )
+
+    except HTTPException:
+        # Already carries the intended status/detail; pass it through untouched.
+        raise
+    except Exception as e:
+        logger.error(f"Markdown → Word 转换失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"转换失败: {str(e)}"
+        ) from e
+
+
+@app.post("/api/protocol/export/docx")
+async def export_protocol_to_docx(request: ProtocolToDocxRequest):
+    """
+    Export a sectioned research protocol as a Word (.docx) document.
+
+    Args:
+        request: ProtocolToDocxRequest
+            - sections: mapping of section key to section content
+            - title: document title
+
+    Returns:
+        Response whose body is the .docx bytes, sent as an attachment named
+        "research_protocol.docx".
+
+    Raises:
+        HTTPException: status 500 when the export reports failure or any
+            other error occurs while producing or reading the file.
+    """
+    try:
+        logger.info(f"开始导出研究方案, 章节数: {len(request.sections)}")
+
+        # Build the document; no output path is given, so the result points
+        # at a temporary file.
+        result = create_protocol_docx(
+            sections=request.sections,
+            title=request.title
+        )
+
+        if not result["success"]:
+            logger.error(f"导出失败: {result.get('error', 'Unknown error')}")
+            raise HTTPException(
+                status_code=500,
+                detail=result.get("error", "导出失败")
+            )
+
+        # Read the generated file. Cleanup sits in `finally` so the temporary
+        # file is removed even when opening or reading it fails.
+        output_path = result["output_path"]
+        try:
+            with open(output_path, 'rb') as f:
+                content = f.read()
+        finally:
+            try:
+                os.remove(output_path)
+            except Exception as e:
+                # Best effort: a leftover temp file must not fail the request.
+                logger.warning(f"清理临时文件失败: {e}")
+
+        logger.info(f"研究方案导出成功, 文件大小: {len(content)} bytes")
+
+        # Return the document as a download
+        return Response(
+            content=content,
+            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            headers={
+                "Content-Disposition": 'attachment; filename="research_protocol.docx"'
+            }
+        )
+
+    except HTTPException:
+        # Already carries the intended status/detail; pass it through untouched.
+        raise
+    except Exception as e:
+        logger.error(f"研究方案导出失败: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"导出失败: {str(e)}"
+        ) from e
+
+
 # ==================== 启动配置 ====================

 if __name__ == "__main__":
--- a/extraction_service/operations/init.py
+++ b/extraction_service/operations/init.py
@@ -78,5 +78,6 @@ __version__ = '1.0.0'



+


--- a/extraction_service/operations/dropna.py
+++ b/extraction_service/operations/dropna.py
@@ -211,5 +211,6 @@ def get_missing_summary(df: pd.DataFrame) -> dict:



+


--- a/extraction_service/operations/filter.py
+++ b/extraction_service/operations/filter.py
@@ -171,5 +171,6 @@ def apply_filter(



+


--- a/extraction_service/operations/unpivot.py
+++ b/extraction_service/operations/unpivot.py
@@ -335,5 +335,6 @@ def get_unpivot_preview(



+


--- a/extraction_service/requirements.txt
+++ b/extraction_service/requirements.txt
@@ -10,6 +10,7 @@ pdfplumber==0.10.3          # 备用 PDF 处理
 # Word处理
 mammoth==1.6.0              # Docx → Markdown
 python-docx==1.1.0          # Docx 读取
+pypandoc>=1.13              # Markdown → Docx (需要系统安装 pandoc)

 # Excel/CSV处理
 pandas>=2.0.0               # 表格处理
--- a/extraction_service/services/doc_export_service.py
+++ b/extraction_service/services/doc_export_service.py
@@ -0,0 +1,218 @@
+"""
+文档导出服务 - Markdown 转 Word
+
+功能：
+- Markdown → Docx 转换（使用 Pandoc）
+- 支持自定义 Word 模板（Reference Doc）
+- 保证输出格式符合伦理委员会要求
+"""
+
+import os
+import tempfile
+from pathlib import Path
+from typing import Optional
+from loguru import logger
+
+# Try to import pypandoc; PANDOC_AVAILABLE records whether the import succeeded
+try:
+    import pypandoc
+    PANDOC_AVAILABLE = True
+except ImportError:
+    PANDOC_AVAILABLE = False
+    logger.warning("pypandoc 未安装，Word 导出功能不可用")
+
+
+# Template directory: an "assets" folder next to this module
+ASSETS_DIR = Path(__file__).parent / "assets"
+DEFAULT_TEMPLATE = ASSETS_DIR / "protocol_template.docx"
+
+
+def check_pandoc_available() -> dict:
+    """
+    Check whether Pandoc can be used.
+
+    Returns:
+        {
+            "available": bool,
+            "version": str,   # None when unavailable
+            "message": str
+        }
+    """
+    # The Python wrapper itself is missing: nothing further to probe.
+    if not PANDOC_AVAILABLE:
+        return dict(
+            available=False,
+            version=None,
+            message="pypandoc 未安装，请运行: pip install pypandoc",
+        )
+
+    try:
+        pandoc_version = pypandoc.get_pandoc_version()
+    except OSError as exc:
+        # An OSError from the version probe is reported as "Pandoc not installed".
+        return dict(
+            available=False,
+            version=None,
+            message=f"Pandoc 未安装或不在 PATH 中: {str(exc)}。请安装 Pandoc: https://pandoc.org/installing.html",
+        )
+    else:
+        return dict(
+            available=True,
+            version=pandoc_version,
+            message=f"Pandoc {pandoc_version} 已就绪",
+        )
+
+
+def convert_markdown_to_docx(
+    markdown_text: str,
+    output_path: Optional[str] = None,
+    use_template: bool = True,
+    template_path: Optional[str] = None
+) -> dict:
+    """
+    Convert Markdown text into a Word document via Pandoc.
+
+    Args:
+        markdown_text: text in Markdown format
+        output_path: destination file path (optional; a temporary .docx file
+            is created when omitted)
+        use_template: whether to apply a reference template
+        template_path: custom template path (optional; when it is missing or
+            does not exist, DEFAULT_TEMPLATE is used if present)
+
+    Returns:
+        {
+            "success": bool,
+            "output_path": str,  # path of the generated file, None on failure
+            "file_size": int,    # file size in bytes, 0 on failure
+            "message": str,
+            "error": str         # present only on failure
+        }
+        Errors are reported through this dict; exceptions raised during the
+        conversion are caught and not propagated.
+    """
+    # Bail out early when Pandoc cannot be used
+    pandoc_status = check_pandoc_available()
+    if not pandoc_status["available"]:
+        return {
+            "success": False,
+            "output_path": None,
+            "file_size": 0,
+            "message": pandoc_status["message"],
+            "error": "Pandoc 不可用"
+        }
+
+    # Set once this function has created the output file itself, so that a
+    # failed conversion can remove it. Caller-supplied paths are never deleted.
+    created_temp_file = False
+
+    try:
+        # Decide where to write
+        if output_path is None:
+            fd, output_path = tempfile.mkstemp(suffix='.docx')
+            created_temp_file = True
+            # Only the path is needed; Pandoc writes the file itself.
+            os.close(fd)
+
+        # Build the Pandoc arguments
+        extra_args = []
+
+        # Apply a reference template
+        if use_template:
+            if template_path and Path(template_path).exists():
+                extra_args.append(f'--reference-doc={template_path}')
+            elif DEFAULT_TEMPLATE.exists():
+                extra_args.append(f'--reference-doc={DEFAULT_TEMPLATE}')
+                logger.info(f"使用默认模板: {DEFAULT_TEMPLATE}")
+            else:
+                logger.warning("未找到 Word 模板，将使用 Pandoc 默认样式")
+
+        # No table of contents (--toc / --toc-depth) is requested on purpose:
+        # 1. Pandoc titles it "Table of Contents", which does not suit a
+        #    Chinese-language document.
+        # 2. The protocol body already has a chapter structure.
+        # Users who need one can insert it manually in Word.
+
+        logger.info(f"开始转换 Markdown → Docx, 文本长度: {len(markdown_text)} 字符")
+
+        # Run the conversion
+        pypandoc.convert_text(
+            markdown_text,
+            'docx',
+            format='markdown',
+            outputfile=output_path,
+            extra_args=extra_args
+        )
+
+        # Measure the result
+        file_size = os.path.getsize(output_path)
+
+        logger.info(f"转换成功: {output_path}, 大小: {file_size} bytes")
+
+        return {
+            "success": True,
+            "output_path": output_path,
+            "file_size": file_size,
+            "message": f"成功生成 Word 文档 ({file_size} bytes)"
+        }
+
+    except Exception as e:
+        logger.error(f"Markdown → Docx 转换失败: {str(e)}")
+        # The failure result carries output_path=None, so the caller cannot
+        # clean up; remove the temporary file here to avoid leaking it.
+        if created_temp_file:
+            try:
+                os.remove(output_path)
+            except OSError as cleanup_error:
+                logger.warning(f"清理临时文件失败: {cleanup_error}")
+        return {
+            "success": False,
+            "output_path": None,
+            "file_size": 0,
+            "message": "转换失败",
+            "error": str(e)
+        }
+
+
+def create_protocol_docx(
+    sections: dict,
+    output_path: Optional[str] = None,
+    title: str = "临床研究方案"
+) -> dict:
+    """
+    Build a research-protocol Word document from per-section content.
+
+    Args:
+        sections: mapping of section key to section text, e.g.
+            {
+                "title": "...",
+                "background": "...",
+                "objectives": "...",
+                ...
+            }
+            Keys that are absent or map to empty content are left out.
+        output_path: destination file path
+        title: document title, emitted as the top-level heading
+
+    Returns:
+        The result dict of convert_markdown_to_docx.
+    """
+    # Fixed section order and the numbered heading used for each key
+    outline = (
+        ("title", "1. 研究题目"),
+        ("background", "2. 研究背景与立题依据"),
+        ("objectives", "3. 研究目的"),
+        ("design", "4. 研究设计"),
+        ("subjects", "5. 研究对象（纳入/排除标准）"),
+        ("sample_size", "6. 样本量估算"),
+        ("implementation", "7. 研究实施步骤与技术路线"),
+        ("endpoints", "8. 观察指标"),
+        ("data_management", "9. 数据管理与质量控制"),
+        ("safety", "10. 安全性评价"),
+        ("statistics", "11. 统计分析计划"),
+        ("ethics", "12. 伦理与知情同意"),
+        ("timeline", "13. 研究时间表"),
+        ("references", "14. 参考文献"),
+    )
+
+    # Pair each heading with its text, then keep only the non-empty ones
+    headed_texts = ((heading, sections.get(key, "")) for key, heading in outline)
+    chunks = [f"# {title}\n\n"]
+    chunks.extend(
+        f"## {heading}\n\n{text}\n\n"
+        for heading, text in headed_texts
+        if text
+    )
+
+    return convert_markdown_to_docx("".join(chunks), output_path)
+
+
+# Public functions exported by this module
+__all__ = [
+    "check_pandoc_available",
+    "convert_markdown_to_docx",
+    "create_protocol_docx",
+]
+
--- a/extraction_service/services/pdf_markdown_processor.py
+++ b/extraction_service/services/pdf_markdown_processor.py
@@ -150,3 +150,4 @@ def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:



+
--- a/extraction_service/test_dc_api.py
+++ b/extraction_service/test_dc_api.py
@@ -345,5 +345,6 @@ if __name__ == "__main__":



+


--- a/extraction_service/test_execute_simple.py
+++ b/extraction_service/test_execute_simple.py
@@ -111,5 +111,6 @@ except Exception as e:



+


--- a/extraction_service/test_module.py
+++ b/extraction_service/test_module.py
@@ -91,5 +91,6 @@ except Exception as e:



+
				`@@ -211,5 +211,6 @@ def get_missing_summary(df: pd.DataFrame) -> dict:`
				`@@ -150,3 +150,4 @@ def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:`