Files
AIclinicalresearch/extraction_service/services/doc_export_service.py
HaHafeng 303dd78c54 feat(aia): Protocol Agent MVP complete with one-click generation and Word export
- Add one-click research protocol generation with streaming output

- Implement Word document export via Pandoc integration

- Add dynamic dual-panel layout with resizable split pane

- Implement collapsible content for StatePanel stages

- Add conversation history management with title auto-update

- Fix scroll behavior, markdown rendering, and UI layout issues

- Simplify conversation creation logic for reliability
2026-01-25 19:16:36 +08:00

219 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
文档导出服务 - Markdown 转 Word
功能:
- Markdown → Docx 转换(使用 Pandoc)
- 支持自定义 Word 模板(Reference Doc)
- 保证输出格式符合伦理委员会要求
"""
import os
import tempfile
from pathlib import Path
from typing import Optional
from loguru import logger
# pypandoc is an optional dependency: import it defensively so this module
# still loads without it, and record the outcome in PANDOC_AVAILABLE.
try:
    import pypandoc
except ImportError:
    PANDOC_AVAILABLE = False
    logger.warning("pypandoc 未安装Word 导出功能不可用")
else:
    PANDOC_AVAILABLE = True

# Asset directory next to this file, and the default Word reference document.
ASSETS_DIR = Path(__file__).parent.joinpath("assets")
DEFAULT_TEMPLATE = ASSETS_DIR.joinpath("protocol_template.docx")
def check_pandoc_available() -> dict:
    """
    Report whether Pandoc can be used through pypandoc.

    Returns:
        A dict with the keys:
            "available": bool, True only when a Pandoc version was obtained
            "version": value returned by pypandoc.get_pandoc_version(),
                       or None when unavailable
            "message": str, human-readable status text
    """
    status = {"available": False, "version": None, "message": ""}

    # Guard: the pypandoc package itself could not be imported.
    if not PANDOC_AVAILABLE:
        status["message"] = "pypandoc 未安装,请运行: pip install pypandoc"
        return status

    # An OSError from the version query is reported as "Pandoc missing or
    # not on PATH" rather than propagated to the caller.
    try:
        pandoc_version = pypandoc.get_pandoc_version()
    except OSError as exc:
        status["message"] = f"Pandoc 未安装或不在 PATH 中: {exc!s}。请安装 Pandoc: https://pandoc.org/installing.html"
        return status

    status["available"] = True
    status["version"] = pandoc_version
    status["message"] = f"Pandoc {pandoc_version} 已就绪"
    return status
def convert_markdown_to_docx(
    markdown_text: str,
    output_path: Optional[str] = None,
    use_template: bool = True,
    template_path: Optional[str] = None
) -> dict:
    """
    Convert Markdown text into a Word (.docx) document using Pandoc.

    Args:
        markdown_text: Text in Markdown format.
        output_path: Destination file path. When omitted, a temporary
            ``.docx`` file is created and its path is returned.
        use_template: Whether to pass a reference document to Pandoc.
        template_path: Custom reference document. Used only when it exists;
            otherwise the bundled default template is tried.

    Returns:
        A dict with the keys:
            "success": bool
            "output_path": str path of the generated file, None on failure
            "file_size": int size in bytes, 0 on failure
            "message": str
            "error": str, present only on failure

    Failures are never raised: any exception during conversion is logged and
    reported through the returned dict. A temporary file created by this
    call is removed again when the conversion fails.
    """
    # Bail out early when Pandoc cannot be used at all.
    pandoc_status = check_pandoc_available()
    if not pandoc_status["available"]:
        return {
            "success": False,
            "output_path": None,
            "file_size": 0,
            "message": pandoc_status["message"],
            "error": "Pandoc 不可用"
        }

    # Remember whether this call created the output file itself, so a failed
    # conversion does not leave an orphaned temp file on disk.
    created_temp_file = False
    try:
        if output_path is None:
            fd, output_path = tempfile.mkstemp(suffix='.docx')
            created_temp_file = True
            # Pandoc writes to the path itself; only the name is needed here.
            os.close(fd)

        # Build the extra Pandoc command-line arguments.
        extra_args = []
        if use_template:
            if template_path and Path(template_path).exists():
                extra_args.append(f'--reference-doc={template_path}')
            else:
                if template_path:
                    # Make the fallback visible instead of silently ignoring
                    # a caller-supplied template that does not exist.
                    logger.warning(f"自定义模板不存在: {template_path},将尝试使用默认模板")
                if DEFAULT_TEMPLATE.exists():
                    extra_args.append(f'--reference-doc={DEFAULT_TEMPLATE}')
                    logger.info(f"使用默认模板: {DEFAULT_TEMPLATE}")
                else:
                    logger.warning("未找到 Word 模板,将使用 Pandoc 默认样式")

        # NOTE: no table of contents (--toc) is added on purpose:
        # 1. Pandoc titles it "Table of Contents", which does not suit a
        #    Chinese document.
        # 2. The protocol body already has a chapter structure.
        # Users can insert a TOC manually in Word if they need one.

        logger.info(f"开始转换 Markdown → Docx, 文本长度: {len(markdown_text)} 字符")

        # Run the conversion; the result is written to output_path.
        pypandoc.convert_text(
            markdown_text,
            'docx',
            format='markdown',
            outputfile=output_path,
            extra_args=extra_args
        )

        file_size = os.path.getsize(output_path)
        logger.info(f"转换成功: {output_path}, 大小: {file_size} bytes")
        return {
            "success": True,
            "output_path": output_path,
            "file_size": file_size,
            "message": f"成功生成 Word 文档 ({file_size} bytes)"
        }
    except Exception as e:
        logger.error(f"Markdown → Docx 转换失败: {str(e)}")
        if created_temp_file:
            # Best-effort cleanup: the path is not returned on failure, so
            # nobody else could ever delete this file.
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                logger.warning(f"临时文件清理失败: {output_path} ({cleanup_error})")
        return {
            "success": False,
            "output_path": None,
            "file_size": 0,
            "message": "转换失败",
            "error": str(e)
        }
def create_protocol_docx(
    sections: dict,
    output_path: Optional[str] = None,
    title: str = "临床研究方案"
) -> dict:
    """
    Build a research-protocol Word document from per-section content.

    Args:
        sections: Mapping from section key (e.g. "title", "background",
            "objectives", ...) to that section's Markdown content. Keys that
            are missing or map to an empty value are left out of the output.
        output_path: Destination file path, forwarded unchanged to
            convert_markdown_to_docx.
        title: Document title, rendered as the top-level heading.

    Returns:
        The result dict produced by convert_markdown_to_docx.
    """
    # Fixed section order and the numbered heading shown for each key.
    outline = (
        ("title", "1. 研究题目"),
        ("background", "2. 研究背景与立题依据"),
        ("objectives", "3. 研究目的"),
        ("design", "4. 研究设计"),
        ("subjects", "5. 研究对象(纳入/排除标准)"),
        ("sample_size", "6. 样本量估算"),
        ("implementation", "7. 研究实施步骤与技术路线"),
        ("endpoints", "8. 观察指标"),
        ("data_management", "9. 数据管理与质量控制"),
        ("safety", "10. 安全性评价"),
        ("statistics", "11. 统计分析计划"),
        ("ethics", "12. 伦理与知情同意"),
        ("timeline", "13. 研究时间表"),
        ("references", "14. 参考文献"),
    )

    # Pair every heading with its content, then render only the sections
    # that actually have content, keeping the outline order.
    filled = ((heading, sections.get(key, "")) for key, heading in outline)
    body_markdown = "".join(
        f"## {heading}\n\n{text}\n\n" for heading, text in filled if text
    )

    return convert_markdown_to_docx(f"# {title}\n\n" + body_markdown, output_path)
# Public API: the names exported by `from <module> import *`.
__all__ = [
"check_pandoc_available",
"convert_markdown_to_docx",
"create_protocol_docx",
]