"""
Txt文本文件提取服务

直接读取纯文本文件
支持多种编码自动检测
"""

import codecs
from pathlib import Path
from typing import Dict, Any, List, Optional

import chardet
from loguru import logger


def extract_txt(file_path: str) -> Dict[str, Any]:
    """
    Extract the text content of a ``.txt`` file.

    The path is validated (exists, is a regular file, has a ``.txt``
    suffix, is not empty), the encoding is guessed with ``detect_encoding``
    and the file is then decoded by ``read_with_fallback``, which may end up
    using a different encoding than the detected one. BOM handling is
    delegated to ``read_with_fallback``. The whole decoded text is returned
    in memory; there is no chunked/streaming mode.

    Args:
        file_path: Path of the txt file.

    Returns:
        On success::

            {
                "success": True,
                "text": <decoded text>,
                "encoding": <encoding that actually decoded the file>,
                "metadata": {
                    "char_count": <len(text)>,
                    "line_count": <number of "\\n" characters + 1>,
                    "file_size": <size in bytes>,
                    "size_kb": <size in KB, rounded to 2 decimals>,
                },
            }

        On failure ``"success"`` is False, ``"error"`` holds a message,
        ``"text"`` is empty and ``"metadata"`` is empty (for an empty file
        it contains zeroed ``char_count`` / ``line_count`` / ``file_size``).
        No exception is propagated: unexpected errors are logged and
        reported through the same failure shape.
    """

    def _failure(message: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        # Every failure shares this shape so callers can branch on "success" alone.
        return {
            "success": False,
            "error": message,
            "text": "",
            "metadata": metadata,
        }

    try:
        file_path_obj = Path(file_path)

        if not file_path_obj.exists():
            return _failure(f"文件不存在: {file_path}", {})

        # A directory (or FIFO/device) whose name ends in ".txt" would
        # otherwise reach the decoding step and fail with a misleading
        # "cannot decode" error.
        if not file_path_obj.is_file():
            return _failure(f"路径不是文件: {file_path}", {})

        if file_path_obj.suffix.lower() != '.txt':
            return _failure(
                f"不支持的文件格式: {file_path_obj.suffix}，仅支持.txt", {}
            )

        file_size = file_path_obj.stat().st_size

        if file_size == 0:
            return _failure(
                "文件为空",
                {"char_count": 0, "line_count": 0, "file_size": 0},
            )

        logger.info(f"开始提取Txt文件: {file_path_obj.name} ({file_size / 1024:.2f} KB)")

        detected_encoding = detect_encoding(file_path)
        logger.info(f"检测到编码: {detected_encoding}")

        # The detected encoding is only a hint; the reader falls back to
        # other encodings and reports the one that actually worked.
        text, actual_encoding = read_with_fallback(file_path, detected_encoding)

        if text is None:
            return _failure("无法解码文件，尝试了多种编码均失败", {})

        char_count = len(text)
        # Counts "\n"-separated segments, so a trailing newline adds one
        # (empty) final line to the total.
        line_count = text.count('\n') + 1

        logger.info(f"Txt提取成功: {char_count}个字符, {line_count}行")

        return {
            "success": True,
            "text": text,
            "encoding": actual_encoding,
            "metadata": {
                "char_count": char_count,
                "line_count": line_count,
                "file_size": file_size,
                "size_kb": round(file_size / 1024, 2)
            }
        }

    except Exception as e:
        # Top-level boundary: report the problem to the caller instead of raising.
        logger.error(f"Txt提取失败: {str(e)}")
        return _failure(str(e), {})


def detect_encoding(file_path: str, sample_size: int = 10000) -> str:
    """
    Guess the text encoding of a file from its leading bytes.

    Up to ``sample_size`` bytes are read in binary mode and handed to
    ``chardet.detect``. The guess is only trusted when chardet reports a
    confidence of at least 0.7; otherwise, or when chardet names no
    encoding, or when anything goes wrong while reading/detecting, the
    function falls back to ``'utf-8'``.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Maximum number of bytes sampled from the file start.

    Returns:
        The encoding name reported by chardet, or ``'utf-8'`` as fallback.
        This function never raises; failures are logged as warnings.
    """
    fallback = 'utf-8'

    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(sample_size)

        guess = chardet.detect(sample)
        encoding, confidence = guess['encoding'], guess['confidence']

        logger.debug(f"编码检测: {encoding} (置信度: {confidence:.2f})")

        # A weak guess is not worth trusting; let the caller start with UTF-8.
        if confidence < 0.7:
            logger.warning(f"编码置信度较低({confidence:.2f})，将尝试UTF-8")
            return fallback

        # chardet may report no encoding at all (None).
        return encoding or fallback

    except Exception as exc:
        logger.warning(f"编码检测失败: {str(exc)}，使用UTF-8")
        return fallback


def read_with_fallback(file_path: str, primary_encoding: str) -> tuple[Optional[str], Optional[str]]:
    """
    Read a text file, trying a prioritized list of encodings until one decodes.

    ``primary_encoding`` is tried first, followed by common UTF-8, Chinese
    (GBK family) and Western encodings. Decoding is strict, so an encoding
    is only accepted when every byte of the file is valid in it. A leading
    U+FEFF (byte order mark) is removed from the returned text.

    Note that ``latin-1`` maps every possible byte, so as the last resort it
    always "succeeds"; text returned with that encoding may be mojibake.

    Args:
        file_path: Path of the file to read.
        primary_encoding: Preferred encoding (typically the detected one).
            Falsy or unknown names are skipped.

    Returns:
        ``(text, encoding)`` where ``encoding`` is the candidate name that
        decoded the file, or ``(None, None)`` when the file cannot be read
        (I/O error) or no candidate decodes it.
    """
    # Priority order. cp1252 must come before latin-1: latin-1 accepts any
    # byte sequence, so nothing listed after it can ever be reached.
    encodings = [
        primary_encoding,
        'utf-8',
        'utf-8-sig',  # UTF-8 with BOM
        'gbk',
        'gb2312',
        'gb18030',
        'cp1252',
        'latin-1',
        'iso-8859-1'
    ]

    # De-duplicate by canonical codec name (keeping order) so aliases such as
    # "latin-1"/"iso-8859-1" or "UTF-8"/"utf8" are not tried twice.
    seen = set()
    unique_encodings = []
    for enc in encodings:
        if not enc:
            continue
        try:
            canonical = codecs.lookup(enc).name
        except LookupError:
            # e.g. a detector-reported name that Python has no codec for.
            logger.debug(f"未知编码 {enc}，已跳过")
            continue
        if canonical not in seen:
            seen.add(canonical)
            unique_encodings.append(enc)

    for encoding in unique_encodings:
        try:
            with open(file_path, 'r', encoding=encoding, errors='strict') as f:
                text = f.read()

        except UnicodeDecodeError:
            logger.debug(f"编码 {encoding} 解码失败，尝试下一个")
            continue

        except OSError as e:
            # Missing file, permission problem, directory, ...: unrelated to
            # the encoding, so retrying with other encodings cannot help.
            logger.error(f"读取文件失败: {str(e)}")
            return None, None

        except Exception as e:
            logger.warning(f"使用编码 {encoding} 读取失败: {str(e)}")
            continue

        # Plain "utf-8" (tried before "utf-8-sig") keeps the BOM as U+FEFF.
        if text.startswith('\ufeff'):
            text = text[1:]

        logger.info(f"成功使用编码: {encoding}")
        return text, encoding

    # Every candidate failed.
    logger.error("所有编码尝试均失败")
    return None, None


def validate_txt_file(file_path: str, max_size: int = 10 * 1024 * 1024) -> Dict[str, Any]:
    """
    Check whether a path points to a usable txt file.

    The checks run in this order: the path exists, it is a regular file,
    its suffix is ``.txt`` (case-insensitive), its size does not exceed
    ``max_size`` and it is not empty. For a valid file the encoding is
    additionally guessed with ``detect_encoding``.

    Args:
        file_path: Path of the file to validate.
        max_size: Maximum accepted file size in bytes (default 10MB).

    Returns:
        ``{"valid": False, "reason": <message>}`` when a check fails or an
        unexpected error occurs (no exception is propagated), otherwise::

            {
                "valid": True,
                "reason": "文件有效",
                "file_info": {
                    "filename": <file name>,
                    "size": <size in bytes>,
                    "size_kb": <size in KB, rounded to 2 decimals>,
                    "detected_encoding": <guessed encoding>,
                },
            }
    """
    try:
        file_path_obj = Path(file_path)

        if not file_path_obj.exists():
            return {
                "valid": False,
                "reason": "文件不存在"
            }

        # Without this check a directory named "*.txt" passes every test
        # below, because detect_encoding swallows the read error and falls
        # back to UTF-8.
        if not file_path_obj.is_file():
            return {
                "valid": False,
                "reason": "路径不是文件"
            }

        if file_path_obj.suffix.lower() != '.txt':
            return {
                "valid": False,
                "reason": f"不支持的格式: {file_path_obj.suffix}（仅支持.txt）"
            }

        file_size = file_path_obj.stat().st_size

        if file_size > max_size:
            # ":g" renders the default limit as "10", keeping the original message.
            limit_mb = max_size / 1024 / 1024
            return {
                "valid": False,
                "reason": f"文件过大: {file_size / 1024 / 1024:.2f}MB（限制{limit_mb:g}MB）"
            }

        if file_size == 0:
            return {
                "valid": False,
                "reason": "文件为空"
            }

        encoding = detect_encoding(str(file_path_obj))

        return {
            "valid": True,
            "reason": "文件有效",
            "file_info": {
                "filename": file_path_obj.name,
                "size": file_size,
                "size_kb": round(file_size / 1024, 2),
                "detected_encoding": encoding
            }
        }

    except Exception as e:
        return {
            "valid": False,
            "reason": f"验证失败: {str(e)}"
        }


def preview_txt(file_path: str, lines: int = 10) -> Dict[str, Any]:
    """
    Return the first ``lines`` lines of a txt file.

    The file is read completely through ``extract_txt`` and then split on
    ``"\\n"``; the preview is the first ``lines`` segments joined back with
    ``"\\n"``.

    Args:
        file_path: Path of the txt file.
        lines: Maximum number of lines in the preview. Values below zero
            are treated as zero.

    Returns:
        On success::

            {
                "success": True,
                "preview": <first lines of the text>,
                "total_lines": <number of "\\n"-separated segments>,
                "preview_lines": <number of lines in the preview>,
            }

        ``total_lines`` counts segments, so a text ending in a newline
        includes one final empty line.

        On failure ``"success"`` is False and ``"preview"`` is an empty
        string. When the failure comes from ``extract_txt`` its result
        fields (such as ``"error"``) are passed through as well. No
        exception is propagated.
    """
    try:
        result = extract_txt(file_path)

        if not result['success']:
            # Pass the extraction failure through, but always include a
            # "preview" key so both failure paths share it.
            return {**result, "preview": ""}

        text_lines = result['text'].split('\n')

        # A negative count would make the slice drop lines from the END of
        # the file instead of limiting the preview.
        limit = max(lines, 0)
        preview_lines = text_lines[:limit]

        return {
            "success": True,
            "preview": '\n'.join(preview_lines),
            "total_lines": len(text_lines),
            "preview_lines": len(preview_lines)
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "preview": ""
        }