Files

258 lines
7.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Docx文档提取服务
使用Mammoth库提取Word文档文本
支持.docx格式不支持老版.doc
"""
import mammoth
from pathlib import Path
from typing import Dict, Any
from loguru import logger
def extract_docx_mammoth(file_path: str) -> Dict[str, Any]:
    """
    Extract the plain text of a .docx file using Mammoth.

    Args:
        file_path: Path to the .docx file.

    Returns:
        On success::

            {
                "success": True,
                "text": <extracted text>,
                "format": "plain_text",
                "metadata": {
                    "char_count": <len(text)>,
                    "has_tables": <heuristic flag, see note in the body>,
                    "file_size": <size on disk in bytes>,
                    "warnings": <number of messages Mammoth reported>,
                },
            }

        On failure ``"success"`` is False, ``"text"`` is ``""`` and
        ``"error"`` carries a message. ``"metadata"`` is empty, except
        when the document has no extractable text, in which case it holds
        ``"char_count": 0`` and ``"file_size"``.

    This function does not raise: any exception is logged and reported
    through the returned dict.
    """
    try:
        file_path_obj = Path(file_path)

        # Guard: the file must exist.
        if not file_path_obj.exists():
            return {
                "success": False,
                "error": f"文件不存在: {file_path}",
                "text": "",
                "metadata": {},
            }

        # Guard: only the .docx suffix is accepted (case-insensitive).
        if file_path_obj.suffix.lower() != '.docx':
            return {
                "success": False,
                "error": f"不支持的文件格式: {file_path_obj.suffix},仅支持.docx",
                "text": "",
                "metadata": {},
            }

        logger.info(f"开始提取Docx文件: {file_path_obj.name}")

        # Let Mammoth pull the raw text out of the document.
        with open(file_path, "rb") as docx_file:
            result = mammoth.extract_raw_text(docx_file)

        text = result.value
        messages = result.messages

        # Surface anything Mammoth complained about during extraction.
        if messages:
            logger.warning(f"Mammoth提取警告: {len(messages)}")
            for msg in messages:
                logger.debug(f" - {msg.type}: {msg.message}")

        # Mammoth's raw-text output ends every paragraph with newlines, so
        # a document made only of empty paragraphs yields whitespace, not
        # "". Treat whitespace-only output as an empty document instead of
        # reporting it as a successful extraction.
        if not text.strip():
            logger.warning("提取的文本为空")
            return {
                "success": False,
                "error": "文档内容为空或无法提取",
                "text": "",
                "metadata": {
                    "char_count": 0,
                    "file_size": file_path_obj.stat().st_size,
                },
            }

        char_count = len(text)

        # NOTE(review): rough heuristic only. It reacts to any literal tab
        # or "|" in the text, so it can misreport in both directions;
        # confirm whether callers rely on this flag.
        has_tables = '\t' in text or '|' in text

        logger.info(f"Docx提取成功: {char_count}个字符")
        return {
            "success": True,
            "text": text,
            "format": "plain_text",
            "metadata": {
                "char_count": char_count,
                "has_tables": has_tables,
                "file_size": file_path_obj.stat().st_size,
                "warnings": len(messages),
            },
        }
    except Exception as e:
        # Boundary handler: report the failure to the caller, never raise.
        logger.error(f"Docx提取失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "text": "",
            "metadata": {},
        }
def extract_docx_html(file_path: str) -> Dict[str, Any]:
    """
    Convert a Docx file to HTML with Mammoth.

    Args:
        file_path: Path to the Docx file.

    Returns:
        On success::

            {
                "success": True,
                "html": <converted HTML>,
                "format": "html",
                "metadata": {
                    "html_length": <len(html)>,
                    "file_size": <size on disk in bytes>,
                    "warnings": <number of messages Mammoth reported>,
                },
            }

        On failure ``"success"`` is False, ``"error"`` carries a message,
        ``"html"`` is ``""`` and ``"metadata"`` is empty.

    This function does not raise: any exception is logged and reported
    through the returned dict.
    """

    def _failed(reason: str) -> Dict[str, Any]:
        # Single place that builds the failure payload.
        return {
            "success": False,
            "error": reason,
            "html": "",
            "metadata": {},
        }

    try:
        source = Path(file_path)
        if not source.exists():
            return _failed(f"文件不存在: {file_path}")

        logger.info(f"开始提取Docx为HTML: {source.name}")

        # Run the HTML conversion on the opened document.
        with open(file_path, "rb") as stream:
            conversion = mammoth.convert_to_html(stream)

        markup = conversion.value
        notices = conversion.messages
        if notices:
            logger.warning(f"HTML转换警告: {len(notices)}")

        logger.info(f"HTML提取成功: {len(markup)}个字符")
        details = {
            "html_length": len(markup),
            "file_size": source.stat().st_size,
            "warnings": len(notices),
        }
        return {
            "success": True,
            "html": markup,
            "format": "html",
            "metadata": details,
        }
    except Exception as exc:
        # Boundary handler: report the failure to the caller, never raise.
        logger.error(f"HTML提取失败: {str(exc)}")
        return _failed(str(exc))
def validate_docx_file(file_path: str) -> Dict[str, Any]:
    """
    Run basic validity checks on a Docx file.

    The checks run in this order: the path exists, the suffix is .docx
    (case-insensitive), the size is at most 50MB, the file is not empty,
    and the first four bytes are the ZIP local-file-header signature
    (a .docx is a ZIP container).

    Args:
        file_path: Path of the file to validate.

    Returns:
        ``{"valid": False, "reason": <message>}`` when a check fails, or::

            {
                "valid": True,
                "reason": "文件有效",
                "file_info": {
                    "filename": <base name>,
                    "size": <size in bytes>,
                    "size_mb": <size in MB, rounded to 2 decimals>,
                },
            }

        ``"file_info"`` is only present when the file is valid.

    This function does not raise: unexpected errors are reported as an
    invalid result.
    """
    try:
        file_path_obj = Path(file_path)

        # The file must exist.
        if not file_path_obj.exists():
            return {
                "valid": False,
                "reason": "文件不存在",
            }

        # Only the .docx suffix is accepted.
        if file_path_obj.suffix.lower() != '.docx':
            return {
                "valid": False,
                # The closing parenthesis was missing from this message.
                "reason": f"不支持的格式: {file_path_obj.suffix}(仅支持.docx)",
            }

        # Enforce the 50MB size limit and reject empty files.
        file_size = file_path_obj.stat().st_size
        max_size = 50 * 1024 * 1024  # 50MB
        if file_size > max_size:
            return {
                "valid": False,
                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB限制50MB",
            }
        if file_size == 0:
            return {
                "valid": False,
                "reason": "文件为空",
            }

        # Basic content check: compare the first 4 bytes with the ZIP
        # signature. Read errors get their own message.
        try:
            with open(file_path, "rb") as f:
                signature = f.read(4)
            if signature != b'PK\x03\x04':
                return {
                    "valid": False,
                    "reason": "不是有效的Docx文件ZIP签名错误",
                }
        except Exception as e:
            return {
                "valid": False,
                "reason": f"无法读取文件: {str(e)}",
            }

        return {
            "valid": True,
            "reason": "文件有效",
            "file_info": {
                "filename": file_path_obj.name,
                "size": file_size,
                "size_mb": round(file_size / 1024 / 1024, 2),
            },
        }
    except Exception as e:
        # Anything unexpected (e.g. stat() failing) counts as invalid.
        return {
            "valid": False,
            "reason": f"验证失败: {str(e)}",
        }