Sprint 1-3 Completed (Backend + Frontend): Backend (Sprint 1-2): - Implement 5-layer Agent framework (Query->Planner->Executor->Tools->Reflection) - Create agent_schema with 6 tables (agent_definitions, stages, prompts, sessions, traces, reflexion_rules) - Create protocol_schema with 2 tables (protocol_contexts, protocol_generations) - Implement Protocol Agent core services (Orchestrator, ContextService, PromptBuilder) - Integrate LLM service adapter (DeepSeek/Qwen/GPT-5/Claude) - 6 API endpoints with full authentication - 10/10 API tests passed Frontend (Sprint 3): - Add Protocol Agent entry in AgentHub (indigo theme card) - Implement ProtocolAgentPage with 3-column layout - Collapsible sidebar (Gemini style, 48px <-> 280px) - StatePanel with 5 stage cards (scientific_question, pico, study_design, sample_size, endpoints) - ChatArea with sync button and action cards integration - 100% prototype design restoration (608 lines CSS) - Detailed endpoints structure: baseline, exposure, outcomes, confounders Features: - 5-stage dialogue flow for research protocol design - Conversation-driven interaction with sync-to-protocol button - Real-time context state management - One-click protocol generation button (UI ready, backend pending) Database: - agent_schema: 6 tables for reusable Agent framework - protocol_schema: 2 tables for Protocol Agent - Seed data: 1 agent + 5 stages + 9 prompts + 4 reflexion rules Code Stats: - Backend: 13 files, 4338 lines - Frontend: 14 files, 2071 lines - Total: 27 files, 6409 lines Status: MVP core functionality completed, pending frontend-backend integration testing Next: Sprint 4 - One-click protocol generation + Word export
153 lines
4.6 KiB
Python
153 lines
4.6 KiB
Python
"""
|
||
PDF Markdown 处理器 - 基于 pymupdf4llm
|
||
|
||
特点:
|
||
- 输出 LLM 友好的 Markdown 格式
|
||
- 完整保留表格结构
|
||
- 自动检测扫描件并返回友好提示
|
||
- 零 OCR,只处理电子版 PDF
|
||
"""
|
||
|
||
import pymupdf4llm
|
||
from pathlib import Path
|
||
from typing import Dict, Any, Optional, List
|
||
from loguru import logger
|
||
|
||
|
||
class PdfMarkdownProcessor:
    """Convert text-based PDFs to Markdown using pymupdf4llm.

    No OCR is performed. When the amount of extracted text falls below
    ``MIN_TEXT_THRESHOLD`` the document is treated as a scan and a
    human-readable notice is returned in place of the Markdown body.
    """

    # Scanned-document threshold: fewer extracted characters than this
    # is treated as a scanned (image-only) PDF.
    MIN_TEXT_THRESHOLD = 50

    def __init__(self, image_dir: str = "./images"):
        """Store the directory that extracted images are written to.

        Args:
            image_dir: Target directory passed to pymupdf4llm as
                ``image_path`` when image extraction is enabled.
        """
        self.image_dir = image_dir

    def to_markdown(
        self,
        pdf_path: str,
        page_chunks: bool = False,
        extract_images: bool = False,
        dpi: int = 150
    ) -> Dict[str, Any]:
        """Convert a PDF file to Markdown (text-based PDFs only).

        Args:
            pdf_path: Path of the PDF file.
            page_chunks: Ask pymupdf4llm for per-page chunks; the chunks are
                merged into one string with a ``## Page N`` header per page.
            extract_images: Write images to ``self.image_dir`` (off by
                default to save space).
            dpi: Image resolution forwarded to pymupdf4llm.

        Returns:
            On success::

                {
                    "success": True,
                    "markdown": "<Markdown text, or a scanned-PDF notice>",
                    "metadata": {
                        "page_count": 10,
                        "char_count": 5000,
                        "is_scanned": False,
                    },
                    "is_scanned": False,
                }

            ``char_count`` counts extracted text only; the page headers and
            separators added when merging page chunks are not included.

            On failure the exception is logged, not raised, and the result
            has ``"success": False``, an ``"error"`` message and a
            ``"markdown"`` notice (no ``"metadata"`` / ``"is_scanned"``).
        """
        filename = Path(pdf_path).name

        try:
            logger.info(f"开始使用 pymupdf4llm 处理: {filename}")

            md_text = pymupdf4llm.to_markdown(
                pdf_path,
                page_chunks=page_chunks,
                write_images=extract_images,
                image_path=self.image_dir if extract_images else None,
                dpi=dpi,
                show_progress=False
            )

            if isinstance(md_text, list):
                # page_chunks=True yields a list of per-page dicts. Count the
                # text BEFORE adding headers/separators: otherwise the
                # synthetic "## Page N" scaffolding pushes every multi-page
                # scan above the threshold and it is never detected.
                md_text, char_count = self._merge_page_chunks(md_text)
            else:
                char_count = len(md_text.strip())

            # Quality check: too little text means the PDF is probably a scan.
            if char_count < self.MIN_TEXT_THRESHOLD:
                logger.warning(f"PDF 文本过少 ({char_count} 字符),可能为扫描件: {filename}")
                return {
                    "success": True,
                    "markdown": self._scan_pdf_hint(filename, char_count),
                    "metadata": {
                        "page_count": self._get_page_count(pdf_path),
                        "char_count": char_count,
                        "is_scanned": True
                    },
                    "is_scanned": True
                }

            page_count = self._get_page_count(pdf_path)

            logger.info(f"PDF 处理完成: {page_count} 页, {char_count} 字符")

            return {
                "success": True,
                "markdown": md_text,
                "metadata": {
                    "page_count": page_count,
                    "char_count": char_count,
                    "is_scanned": False
                },
                "is_scanned": False
            }

        except Exception as e:
            # Boundary handler: callers receive a result dict, never an
            # exception, so a broken document cannot abort a batch.
            logger.error(f"PDF 解析失败: {filename}, 错误: {e}")
            return {
                "success": False,
                "error": str(e),
                "markdown": f"> **系统提示**:文档 `{filename}` 解析失败: {str(e)}"
            }

    @staticmethod
    def _merge_page_chunks(chunks: List[Dict[str, Any]]) -> tuple:
        """Merge per-page chunks into one Markdown string.

        Args:
            chunks: Per-page dicts; the page text is read from the ``"text"``
                key (a missing or ``None`` value counts as empty).

        Returns:
            ``(markdown, char_count)`` where ``markdown`` has a ``## Page N``
            header per page joined by ``---`` rules, and ``char_count`` is
            the total stripped length of the page texts alone.
        """
        texts = [chunk.get("text") or "" for chunk in chunks]
        merged = "\n\n---\n\n".join(
            f"## Page {number}\n\n{text}"
            for number, text in enumerate(texts, start=1)
        )
        return merged, sum(len(text.strip()) for text in texts)

    def _get_page_count(self, pdf_path: str) -> int:
        """Return the number of pages in the PDF, or 0 if it cannot be read."""
        try:
            import fitz  # pymupdf

            # The context manager closes the document even if len() raises.
            with fitz.open(pdf_path) as doc:
                return len(doc)
        except Exception:
            # Best effort: the page count is informational metadata only, so
            # a failure here must not fail the whole conversion.
            return 0

    def _scan_pdf_hint(self, filename: str, char_count: int) -> str:
        """Build the Markdown block-quote notice shown for scanned PDFs.

        Args:
            filename: Name of the document, shown in the notice.
            char_count: Number of extracted characters, shown in the notice.
        """
        # Built from explicit lines so the block-quote markers always start
        # at column 0 regardless of how this method is indented.
        return "\n".join([
            f"> **系统提示**:文档 `{filename}` 似乎是扫描件(图片型 PDF)。",
            ">",
            f"> - 提取文本量:{char_count} 字符",
            "> - 本系统暂不支持扫描版 PDF 的文字识别",
            "> - 建议:请上传电子版 PDF,或将扫描件转换为可编辑格式后重新上传",
        ])
|
||
|
||
|
||
# Convenience function
|
||
def extract_pdf_to_markdown(pdf_path: str) -> Dict[str, Any]:
    """Convert a PDF to Markdown using a default-configured processor.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The result dictionary produced by ``PdfMarkdownProcessor.to_markdown``.
    """
    return PdfMarkdownProcessor().to_markdown(pdf_path)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|