Files
AIclinicalresearch/extraction_service/services/pdf_markdown_processor.py
HaHafeng 96290d2f76 feat(aia): Implement Protocol Agent MVP with reusable Agent framework
Sprint 1-3 Completed (Backend + Frontend):

Backend (Sprint 1-2):
- Implement 5-layer Agent framework (Query->Planner->Executor->Tools->Reflection)
- Create agent_schema with 6 tables (agent_definitions, stages, prompts, sessions, traces, reflexion_rules)
- Create protocol_schema with 2 tables (protocol_contexts, protocol_generations)
- Implement Protocol Agent core services (Orchestrator, ContextService, PromptBuilder)
- Integrate LLM service adapter (DeepSeek/Qwen/GPT-5/Claude)
- 6 API endpoints with full authentication
- 10/10 API tests passed

Frontend (Sprint 3):
- Add Protocol Agent entry in AgentHub (indigo theme card)
- Implement ProtocolAgentPage with 3-column layout
- Collapsible sidebar (Gemini style, 48px <-> 280px)
- StatePanel with 5 stage cards (scientific_question, pico, study_design, sample_size, endpoints)
- ChatArea with sync button and action cards integration
- 100% prototype design restoration (608 lines CSS)
- Detailed endpoints structure: baseline, exposure, outcomes, confounders

Features:
- 5-stage dialogue flow for research protocol design
- Conversation-driven interaction with sync-to-protocol button
- Real-time context state management
- One-click protocol generation button (UI ready, backend pending)

Database:
- agent_schema: 6 tables for reusable Agent framework
- protocol_schema: 2 tables for Protocol Agent
- Seed data: 1 agent + 5 stages + 9 prompts + 4 reflexion rules

Code Stats:
- Backend: 13 files, 4338 lines
- Frontend: 14 files, 2071 lines
- Total: 27 files, 6409 lines

Status: MVP core functionality completed, pending frontend-backend integration testing

Next: Sprint 4 - One-click protocol generation + Word export
2026-01-24 17:29:24 +08:00

153 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
PDF Markdown 处理器 - 基于 pymupdf4llm
特点:
- 输出 LLM 友好的 Markdown 格式
- 完整保留表格结构
- 自动检测扫描件并返回友好提示
- 零 OCR:只处理电子版 PDF
"""
import pymupdf4llm
from pathlib import Path
from typing import Dict, Any, Optional, List
from loguru import logger
class PdfMarkdownProcessor:
    """Convert digitally-born PDFs to Markdown via ``pymupdf4llm``.

    No OCR is performed. When the amount of extracted text is below
    ``MIN_TEXT_THRESHOLD`` the document is treated as a scanned (image-only)
    PDF and a user-facing hint is returned in place of the Markdown.
    """

    # Scan-detection threshold: fewer extracted characters than this means
    # the PDF is treated as a scanned document.
    MIN_TEXT_THRESHOLD = 50

    def __init__(self, image_dir: str = "./images"):
        """Store the directory used for images when extraction is enabled."""
        self.image_dir = image_dir

    def to_markdown(
        self,
        pdf_path: str,
        page_chunks: bool = False,
        extract_images: bool = False,
        dpi: int = 150
    ) -> Dict[str, Any]:
        """Convert a PDF file to Markdown (digital PDFs only).

        Args:
            pdf_path: Path of the PDF file.
            page_chunks: Ask pymupdf4llm for per-page chunks; the chunks are
                merged into one string with a ``## Page N`` header per page.
            extract_images: Write images to ``self.image_dir`` (off by
                default to save space).
            dpi: Image resolution passed to pymupdf4llm.

        Returns:
            On success::

                {
                    "success": True,
                    "markdown": "<markdown text or scanned-PDF hint>",
                    "metadata": {"page_count": ..., "char_count": ...,
                                 "is_scanned": ...},
                    "is_scanned": <bool>,
                }

            On failure: ``{"success": False, "error": ..., "markdown": ...}``
            where ``markdown`` holds a user-facing error notice. Exceptions
            raised during conversion are caught, not propagated.
        """
        filename = Path(pdf_path).name
        try:
            logger.info(f"开始使用 pymupdf4llm 处理: {filename}")
            md_result = pymupdf4llm.to_markdown(
                pdf_path,
                page_chunks=page_chunks,
                write_images=extract_images,
                image_path=self.image_dir if extract_images else None,
                dpi=dpi,
                show_progress=False
            )

            if isinstance(md_result, list):
                # page_chunks=True yields a list of per-page dicts.
                md_text, extracted_chars = self._merge_page_chunks(md_result)
            else:
                md_text = md_result
                extracted_chars = len(md_text.strip())

            # Quality check. The decision uses only text that came out of the
            # PDF: the synthetic page headers/separators added while merging
            # chunks must not push an empty document over the threshold.
            if extracted_chars < self.MIN_TEXT_THRESHOLD:
                logger.warning(f"PDF 文本过少 ({extracted_chars} 字符),可能为扫描件: {filename}")
                return {
                    "success": True,
                    "markdown": self._scan_pdf_hint(filename, extracted_chars),
                    "metadata": {
                        "page_count": self._get_page_count(pdf_path),
                        "char_count": extracted_chars,
                        "is_scanned": True
                    },
                    "is_scanned": True
                }

            # Reported size is the length of the Markdown actually returned.
            char_count = len(md_text.strip())
            page_count = self._get_page_count(pdf_path)
            logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")
            return {
                "success": True,
                "markdown": md_text,
                "metadata": {
                    "page_count": page_count,
                    "char_count": char_count,
                    "is_scanned": False
                },
                "is_scanned": False
            }
        except Exception as e:
            # Boundary handler: callers receive a structured failure result.
            logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
            return {
                "success": False,
                "error": str(e),
                "markdown": f"> **系统提示**:文档 `{filename}` 解析失败: {str(e)}"
            }

    @staticmethod
    def _merge_page_chunks(chunks: List[Dict[str, Any]]) -> tuple:
        """Merge per-page chunks into one Markdown string.

        Returns:
            ``(markdown, extracted_chars)`` where ``extracted_chars`` is the
            sum of the stripped lengths of each page's ``text`` entry, i.e.
            it excludes the page headers and separators added here.
        """
        # A missing or None "text" entry is rendered as an empty page.
        page_texts = [(chunk.get("text") or "") for chunk in chunks]
        markdown = "\n\n---\n\n".join(
            f"## Page {number}\n\n{text}"
            for number, text in enumerate(page_texts, start=1)
        )
        extracted_chars = sum(len(text.strip()) for text in page_texts)
        return markdown, extracted_chars

    def _get_page_count(self, pdf_path: str) -> int:
        """Return the number of pages in the PDF, or 0 if it cannot be read."""
        try:
            import fitz  # pymupdf

            doc = fitz.open(pdf_path)
            try:
                return len(doc)
            finally:
                # Release the handle even if counting pages fails.
                doc.close()
        except Exception:
            # Best effort: the page count is informational only.
            return 0

    def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
        """Build the user-facing notice shown for scanned (image-only) PDFs."""
        return (
            f"> **系统提示**:文档 `{filename}` 似乎是扫描件(图片型 PDF\n"
            ">\n"
            f"> - 提取文本量:{char_count} 字符\n"
            "> - 本系统暂不支持扫描版 PDF 的文字识别\n"
            "> - 建议:请上传电子版 PDF或将扫描件转换为可编辑格式后重新上传"
        )
# 便捷函数
def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
    """Convert a PDF to Markdown using a default-configured processor.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The result dictionary produced by ``PdfMarkdownProcessor.to_markdown``.
    """
    return PdfMarkdownProcessor().to_markdown(pdf_path)