import os
import json
import csv
import xml.etree.ElementTree as ET
from typing import List, Dict, Any
import logging
import PyPDF2
from docx import Document
import openpyxl
from bs4 import BeautifulSoup
from striprtf.striprtf import rtf_to_text
from odf import text as odf_text, teletype
from odf.opendocument import load as odf_load
import xlrd
from pptx import Presentation
import ebooklib
from ebooklib import epub
import textract
# Module-level logger, named after this module; used by every parser below.
logger = logging.getLogger(__name__)
class DocumentParser:
    """Base class for the format-specific parsers in this module.

    Subclasses assign ``supported_extensions`` (dot-prefixed, lowercase
    suffixes) and override :meth:`parse`.
    """

    def __init__(self):
        # Empty by default; every subclass replaces this with its own list.
        self.supported_extensions = []

    def can_parse(self, file_path: str) -> bool:
        """Return True if the lowercased extension of *file_path* is supported."""
        _, suffix = os.path.splitext(file_path)
        return suffix.lower() in self.supported_extensions

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse *file_path* into a result dict.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError
class TextParser(DocumentParser):
    """Parser for plain-text and Markdown files (``.txt``, ``.md``)."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.txt', '.md']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Read the file as UTF-8 and return its text with basic counts.

        Returns:
            Dict with ``content``, ``type`` (``'text'``) and ``metadata``
            holding ``lines``, ``words`` and ``characters``.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                text = handle.read()
            stats = {
                # Number of '\n'-separated segments (an empty file counts as 1).
                'lines': text.count('\n') + 1,
                'words': len(text.split()),
                'characters': len(text),
            }
            return {'content': text, 'type': 'text', 'metadata': stats}
        except Exception as e:
            logger.error(f"Ошибка парсинга текстового файла {file_path}: {e}")
            raise
class JSONParser(DocumentParser):
    """Parser that renders a ``.json`` file as an indented ``key: value`` outline."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.json']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Load the JSON file (UTF-8) and convert it to readable text.

        Returns:
            Dict with ``content``, ``type`` (``'json'``) and ``metadata``:
            ``keys`` is the size of a top-level object (else 0), ``items``
            the length of a top-level array (else 0).

        Raises:
            Exception: any read/decode error is logged and re-raised.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return {
                'content': self._json_to_text(data),
                'type': 'json',
                'metadata': {
                    'keys': len(data) if isinstance(data, dict) else 0,
                    'items': len(data) if isinstance(data, list) else 0,
                },
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга JSON файла {file_path}: {e}")
            raise

    def _json_to_text(self, data: Any, indent: int = 0) -> str:
        """Render *data* as text, two spaces of indentation per nesting level.

        Dict entries are labelled by key, list entries by their 1-based
        position; scalars are rendered with ``str()``.

        Each recursive call returns text that is already fully indented for
        its own level, so the caller must not prefix it again. (Previously
        the parent prefix was applied to the whole multi-line child string,
        which shifted only its first line and produced ragged indentation
        from the second nesting level on.)
        """
        if not isinstance(data, (dict, list)):
            return str(data)

        pad = '  ' * indent
        if isinstance(data, dict):
            entries = data.items()
        else:
            entries = ((f"Элемент {i + 1}", item) for i, item in enumerate(data))

        lines = []
        for label, value in entries:
            if isinstance(value, (dict, list)):
                lines.append(f"{pad}{label}:")
                # Already indented for level ``indent + 1``.
                lines.append(self._json_to_text(value, indent + 1))
            else:
                lines.append(f"{pad}{label}: {value}")
        return '\n'.join(lines)
class CSVParser(DocumentParser):
    """Parser that renders a ``.csv`` file as labelled rows of ``header: value``."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.csv']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Read the CSV file and convert it to readable text.

        The first row is treated as the header row.

        Returns:
            Dict with ``content``, ``type`` (``'csv'``) and ``metadata``
            (``rows`` including the header row, ``columns``, ``headers``).
            An empty file yields empty content and empty metadata.

        Raises:
            Exception: any read/decode error is logged and re-raised.
        """
        try:
            # newline='' is what the csv module requires so that newlines
            # embedded in quoted fields are handled by the reader itself.
            # 'utf-8-sig' reads plain UTF-8 as before but drops a leading
            # BOM, which would otherwise become part of the first header.
            with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
                rows = list(csv.reader(f))
            if not rows:
                return {'content': '', 'type': 'csv', 'metadata': {}}
            headers = rows[0]
            return {
                'content': self._csv_to_text(headers, rows[1:]),
                'type': 'csv',
                'metadata': {
                    'rows': len(rows),
                    'columns': len(headers),
                    'headers': headers,
                },
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга CSV файла {file_path}: {e}")
            raise

    def _csv_to_text(self, headers: List[str], rows: List[List[str]]) -> str:
        """Format *rows* as numbered blocks of ``header: value`` lines.

        Values in columns beyond the number of headers are not emitted.
        """
        result = []
        if headers:
            result.append("Заголовки: " + ", ".join(headers))
            result.append("")
        for i, row in enumerate(rows):
            result.append(f"Строка {i + 1}:")
            # zip stops at the shorter sequence, so surplus values are skipped.
            result.extend(f"  {header}: {value}" for header, value in zip(headers, row))
            result.append("")
        return '\n'.join(result)
class XMLParser(DocumentParser):
    """Parser that renders an ``.xml`` file as an indented tag outline."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.xml']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Parse the XML file with ElementTree and convert it to text.

        Returns:
            Dict with ``content``, ``type`` (``'xml'``) and ``metadata``
            (``root_tag`` and ``elements``, the number of elements in the
            tree including the root).

        Raises:
            Exception: any parse error is logged and re-raised.
        """
        try:
            root = ET.parse(file_path).getroot()
            return {
                'content': self._xml_to_text(root),
                'type': 'xml',
                'metadata': {
                    'root_tag': root.tag,
                    'elements': sum(1 for _ in root.iter()),
                },
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга XML файла {file_path}: {e}")
            raise

    def _xml_to_text(self, element: ET.Element, indent: int = 0) -> str:
        """Render *element* and its subtree, two spaces per nesting level.

        A leaf with text is written on one line as ``<tag>text</tag>``.
        Any other element gets an opening line, its children, and a closing
        line. Text that precedes the first child and text that follows a
        child (its ``tail``) is written on its own line at child
        indentation; previously such mixed-content text was dropped.
        """
        pad = '  ' * indent
        opening = f"{pad}<{element.tag}"
        if element.attrib:
            attrs = " ".join([f'{k}="{v}"' for k, v in element.attrib.items()])
            opening += f" {attrs}"

        text = (element.text or '').strip()
        if text and len(element) == 0:
            return f"{opening}>{text}</{element.tag}>"

        child_pad = '  ' * (indent + 1)
        lines = [opening + ">"]
        if text:
            lines.append(child_pad + text)
        for child in element:
            lines.append(self._xml_to_text(child, indent + 1))
            tail = (child.tail or '').strip()
            if tail:
                lines.append(child_pad + tail)
        lines.append(f"{pad}</{element.tag}>")
        return '\n'.join(lines)
class HTMLParser(DocumentParser):
    """Parser that strips markup from ``.html`` / ``.htm`` files."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.html', '.htm']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Read the HTML file as UTF-8 and return its text without markup.

        Returns:
            Dict with ``content``, ``type`` (``'html'``) and ``metadata``
            (``original_length`` and ``cleaned_length`` in characters).

        Raises:
            Exception: any read/decode error is logged and re-raised.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            cleaned_content = self._clean_html(content)
            return {
                'content': cleaned_content,
                'type': 'html',
                'metadata': {
                    'original_length': len(content),
                    'cleaned_length': len(cleaned_content),
                },
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга HTML файла {file_path}: {e}")
            raise

    def _clean_html(self, html_content: str) -> str:
        """Reduce *html_content* to plain text.

        Steps: drop ``<script>``/``<style>`` blocks, rewrite simple anchors
        as ``text (href)``, strip remaining tags, decode character
        references, then collapse runs of spaces/tabs and of 3+ newlines.

        Tag matching is case-insensitive: HTML tag names are not
        case-sensitive, and the previous case-sensitive patterns let the
        bodies of ``<SCRIPT>``/``<STYLE>`` blocks leak into the output.
        """
        import html
        import re

        block_flags = re.DOTALL | re.IGNORECASE
        text = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html_content, flags=block_flags)
        text = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', text, flags=block_flags)
        text = re.sub(
            r'<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>([^<]+)</a>',
            r'\2 (\1)',
            text,
            flags=re.IGNORECASE,
        )
        text = re.sub(r'<[^>]+>', '', text)
        # Decode entities only after tags are gone, so an escaped "&lt;b&gt;"
        # stays literal text instead of being stripped as a tag. A decoded
        # &nbsp; is turned into a plain space so it collapses with the rest.
        text = html.unescape(text).replace('\xa0', ' ')
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()
class PDFParser(DocumentParser):
    """Parser for ``.pdf`` files: PyMuPDF when importable, otherwise PyPDF2."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.pdf']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract per-page text from the PDF.

        With PyMuPDF (``fitz``) the text of URI links is annotated as
        ``text (url)`` and generator footers are removed. If ``fitz``
        cannot be imported, a warning is logged and PyPDF2 is used instead.

        Returns:
            Dict with ``content``, ``type`` (``'pdf'``) and ``metadata``
            (``pages``, ``words``, ``characters``).

        Raises:
            Exception: any extraction error, from either backend, is logged
                and re-raised.
        """
        # Only the import decides which backend is used.
        try:
            import fitz
        except ImportError:
            fitz = None
            logger.warning("PyMuPDF не установлен, используется PyPDF2 без извлечения ссылок")

        try:
            if fitz is None:
                return self._parse_with_pypdf2(file_path)
            return self._parse_with_pymupdf(fitz, file_path)
        except Exception as e:
            logger.error(f"Ошибка парсинга PDF файла {file_path}: {e}")
            raise

    def _parse_with_pymupdf(self, fitz, file_path: str) -> Dict[str, Any]:
        """Extract text with the already imported PyMuPDF module *fitz*."""
        doc = fitz.open(file_path)
        try:
            content = []
            for page_num in range(len(doc)):
                page = doc[page_num]
                text = page.get_text()
                for link_text, url in self._collect_links(page).items():
                    if link_text in text:
                        text = text.replace(link_text, f"{link_text} ({url})")
                text = self._normalize_whitespace(self._strip_generator_footers(text))
                if text:
                    content.append(f"Страница {page_num + 1}:\n{text}")
            page_count = len(doc)
        finally:
            # Release the document even when a page fails to extract.
            doc.close()
        return self._build_result(content, page_count)

    def _collect_links(self, page) -> Dict[str, str]:
        """Map the visible text of each URI link on *page* to its target."""
        link_map = {}
        for link in page.get_links():
            if 'uri' not in link:
                continue
            rect = link.get('from', None)
            if not rect:
                continue
            link_text = page.get_textbox(rect).strip()
            if link_text:
                link_map[link_text] = link['uri']
        return link_map

    def _strip_generator_footers(self, text: str) -> str:
        """Remove "Created with ... Page" / "Generated by" style footer text."""
        import re
        text = re.sub(r'\s*Создано в (OfficeSuite|Microsoft Word|LibreOffice|Google Docs|WPS Office)\s*Страница.*?(\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\s*Created with (OfficeSuite|Microsoft Word|LibreOffice|Google Docs|WPS Office)\s*Page.*?(\n|$)', '', text, flags=re.IGNORECASE)
        text = re.sub(r'\s*Generated by.*?(\n|$)', '', text, flags=re.IGNORECASE)
        return text

    def _normalize_whitespace(self, text: str) -> str:
        """Collapse runs of spaces/tabs and of 3+ newlines, then strip."""
        import re
        text = re.sub(r'[ \t]+', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    def _build_result(self, pages: List[str], page_count: int) -> Dict[str, Any]:
        """Assemble the result dict from the per-page text chunks."""
        full_content = '\n\n'.join(pages)
        return {
            'content': full_content,
            'type': 'pdf',
            'metadata': {
                'pages': page_count,
                'words': len(full_content.split()),
                'characters': len(full_content),
            },
        }

    def _parse_with_pypdf2(self, file_path: str) -> Dict[str, Any]:
        """Fallback extraction with PyPDF2 (no link annotation, no footer removal)."""
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            content = []
            for page_num in range(len(pdf_reader.pages)):
                text = pdf_reader.pages[page_num].extract_text()
                if not text:
                    continue
                text = self._normalize_whitespace(text)
                # Skip whitespace-only pages after normalisation, as the
                # PyMuPDF path does, instead of emitting a bare page header.
                if text:
                    content.append(f"Страница {page_num + 1}:\n{text}")
            return self._build_result(content, len(pdf_reader.pages))
class DOCXParser(DocumentParser):
    """Parser for ``.docx`` files: paragraphs (with hyperlink targets) and tables."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.docx']

    def _extract_hyperlinks(self, paragraph) -> str:
        """Return the paragraph text with hyperlinks written as ``text (url)``.

        Only direct children of the paragraph element are examined: plain
        runs contribute their text; hyperlinks contribute their text plus
        the resolved target, except that ``mailto:`` targets and
        unresolvable relationships contribute the text alone. Falls back to
        ``paragraph.text`` if nothing was collected or on any error.
        """
        try:
            ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
            rel_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'
            pieces = []
            for node in paragraph._element:
                if node.tag.endswith('}r'):
                    pieces.extend(t.text for t in node.findall('.//w:t', ns) if t.text)
                elif node.tag.endswith('}hyperlink'):
                    try:
                        rId = node.get(rel_attr)
                        url = None  # stays None when the link text is used alone
                        if rId and hasattr(paragraph.part, 'rels') and rId in paragraph.part.rels:
                            target = paragraph.part.rels[rId].target_ref
                            if not target.startswith('mailto:'):
                                url = target
                        link_text = ''.join(t.text for t in node.findall('.//w:t', ns) if t.text)
                        if link_text:
                            pieces.append(link_text if url is None else f"{link_text} ({url})")
                    except Exception as e:
                        # Best effort: a broken link is logged and skipped.
                        logger.debug(f"Ошибка обработки гиперссылки: {e}")
            if pieces:
                return ''.join(pieces)
            return paragraph.text
        except Exception as e:
            logger.debug(f"Ошибка извлечения гиперссылок: {e}")
            return paragraph.text

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract text from the document body and its tables.

        Non-blank paragraphs come first, then every table row as its
        non-blank cells joined with `` | ``; generator footers are removed
        from the joined text.

        Returns:
            Dict with ``content``, ``type`` (``'docx'``) and ``metadata``
            (``paragraphs``, ``tables``, ``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        import re
        try:
            doc = Document(file_path)
            content = [
                self._extract_hyperlinks(paragraph)
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]
            for table in doc.tables:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                    if cells:
                        content.append(" | ".join(cells))
            full_content = '\n'.join(content)
            footer_patterns = (
                r'\s*Создано в (OfficeSuite|Microsoft Word|LibreOffice|Google Docs|WPS Office)\s*Страница.*?(\n|$)',
                r'\s*Created with (OfficeSuite|Microsoft Word|LibreOffice|Google Docs|WPS Office)\s*Page.*?(\n|$)',
                r'\s*Generated by.*?(\n|$)',
            )
            for pattern in footer_patterns:
                full_content = re.sub(pattern, '', full_content, flags=re.IGNORECASE)
            full_content = full_content.strip()
            metadata = {
                'paragraphs': len(doc.paragraphs),
                'tables': len(doc.tables),
                'words': len(full_content.split()),
                'characters': len(full_content),
            }
            return {'content': full_content, 'type': 'docx', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга DOCX файла {file_path}: {e}")
            raise
class XLSParser(DocumentParser):
    """Parser for ``.xlsx`` workbooks, read through openpyxl."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.xlsx']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Render every sheet as a title line followed by its non-empty rows.

        Each row becomes its non-blank cell values joined with `` | ``; a
        blank line follows each sheet.

        Returns:
            Dict with ``content``, ``type`` (``'xlsx'``) and ``metadata``
            (``sheets``, ``sheet_names``, ``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            workbook = openpyxl.load_workbook(file_path, data_only=True)
            lines = []
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                lines.append(f"Лист: {sheet_name}")
                for row in sheet.iter_rows(values_only=True):
                    cells = [
                        str(value).strip()
                        for value in row
                        if value is not None and str(value).strip()
                    ]
                    if cells:
                        lines.append(" | ".join(cells))
                lines.append("")
            full_content = '\n'.join(lines)
            metadata = {
                # One loop iteration per sheet name, so this equals the count.
                'sheets': len(workbook.sheetnames),
                'sheet_names': workbook.sheetnames,
                'words': len(full_content.split()),
                'characters': len(full_content),
            }
            return {'content': full_content, 'type': 'xlsx', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга Excel файла {file_path}: {e}")
            raise
class XLSLegacyParser(DocumentParser):
    """Parser for legacy ``.xls`` workbooks, read through xlrd."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.xls']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Render every sheet as a title line followed by its non-empty rows.

        Cell values are converted with ``str()``; blank values are skipped
        and the remainder joined with `` | ``. A blank line follows each
        sheet.

        Returns:
            Dict with ``content``, ``type`` (``'xls'``) and ``metadata``
            (``sheets``, ``sheet_names``, ``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            book = xlrd.open_workbook(file_path)
            lines = []
            for sheet in book.sheets():
                lines.append(f"Лист: {sheet.name}")
                for rx in range(sheet.nrows):
                    cells = []
                    for cx in range(sheet.ncols):
                        rendered = str(sheet.cell_value(rx, cx)).strip()
                        if rendered:
                            cells.append(rendered)
                    if cells:
                        lines.append(" | ".join(cells))
                lines.append("")
            full_content = '\n'.join(lines)
            all_sheets = book.sheets()
            metadata = {
                'sheets': len(all_sheets),
                'sheet_names': [s.name for s in all_sheets],
                'words': len(full_content.split()),
                'characters': len(full_content),
            }
            return {'content': full_content, 'type': 'xls', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга XLS файла {file_path}: {e}")
            raise
class RTFParser(DocumentParser):
    """Parser for ``.rtf`` files, converted with ``striprtf``."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.rtf']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Read the RTF source and convert it to plain text.

        The file is decoded as UTF-8 with undecodable bytes ignored.

        Returns:
            Dict with ``content``, ``type`` (``'rtf'``) and ``metadata``
            (``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
                raw_rtf = source.read()
            plain = rtf_to_text(raw_rtf)
            metadata = {'words': len(plain.split()), 'characters': len(plain)}
            return {'content': plain, 'type': 'rtf', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга RTF файла {file_path}: {e}")
            raise
class ODTParser(DocumentParser):
    """Parser for ``.odt`` documents, read through odfpy."""

    # Qualified names of the block-level text elements collected by parse().
    _TEXT_NS = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
    _BLOCK_QNAMES = ((_TEXT_NS, 'p'), (_TEXT_NS, 'h'))

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.odt']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract paragraphs and headings in document order, one per line.

        ``teletype.extractText`` is applied to each paragraph/heading
        element rather than to the object returned by ``load``, and the
        results are joined with newlines so block boundaries survive.

        NOTE(review): relies on the loaded document exposing its
        ``office:body`` element as ``doc.body`` and on nodes carrying
        ``qname`` / ``childNodes`` -- confirm against the installed odfpy
        version.

        Returns:
            Dict with ``content``, ``type`` (``'odt'``) and ``metadata``
            (``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            doc = odf_load(file_path)
            blocks: List[str] = []
            self._collect_blocks(doc.body, blocks)
            content = '\n'.join(blocks)
            return {
                'content': content,
                'type': 'odt',
                'metadata': {
                    'words': len(content.split()),
                    'characters': len(content),
                },
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга ODT файла {file_path}: {e}")
            raise

    def _collect_blocks(self, node, out: List[str]) -> None:
        """Append the text of each paragraph/heading under *node* to *out*."""
        for child in getattr(node, 'childNodes', ()):
            qname = getattr(child, 'qname', None)
            if qname is None:
                # Not an element (e.g. a bare text node): nothing to descend into.
                continue
            if qname in self._BLOCK_QNAMES:
                # extractText already descends into nested content, so the
                # walk stops here to avoid emitting the same text twice.
                out.append(teletype.extractText(child))
            else:
                self._collect_blocks(child, out)
class PPTXParser(DocumentParser):
    """Parser for ``.pptx`` presentations, read through python-pptx."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.pptx']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Render each slide as a numbered header followed by its shape texts.

        For every shape that reports a text frame, the non-blank paragraphs
        are joined with newlines. A blank line follows each slide.

        Returns:
            Dict with ``content``, ``type`` (``'pptx'``) and ``metadata``
            (``slides``, ``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            prs = Presentation(file_path)
            lines: List[str] = []
            for i, slide in enumerate(prs.slides, start=1):
                lines.append(f"Слайд {i}:")
                for shape in slide.shapes:
                    if not getattr(shape, 'has_text_frame', False):
                        continue
                    paragraphs = [p.text for p in shape.text_frame.paragraphs if p.text.strip()]
                    shape_text = '\n'.join(paragraphs)
                    if shape_text:
                        lines.append(shape_text)
                lines.append("")
            full_content = '\n'.join(lines)
            metadata = {
                'slides': len(prs.slides),
                'words': len(full_content.split()),
                'characters': len(full_content),
            }
            return {'content': full_content, 'type': 'pptx', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга PPTX файла {file_path}: {e}")
            raise
class LegacyOfficeParser(DocumentParser):
    """Parser for legacy ``.doc`` / ``.ppt`` files, delegated to textract."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.doc', '.ppt']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract text with ``textract.process`` and decode it as UTF-8.

        Undecodable bytes are ignored. The reported ``type`` is the file's
        lowercased extension without the leading dot.

        Returns:
            Dict with ``content``, ``type`` and ``metadata`` (``words``,
            ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            text = textract.process(file_path).decode('utf-8', errors='ignore')
            _, suffix = os.path.splitext(file_path)
            metadata = {'words': len(text.split()), 'characters': len(text)}
            return {
                'content': text,
                'type': suffix.lower().lstrip('.'),
                'metadata': metadata,
            }
        except Exception as e:
            logger.error(f"Ошибка парсинга legacy Office файла {file_path}: {e}")
            raise
class EPUBParser(DocumentParser):
    """Parser for ``.epub`` books, read through ebooklib and BeautifulSoup."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.epub']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract the text of every document item in the book.

        Each item is decoded as UTF-8 (undecodable bytes ignored), reduced
        to text with BeautifulSoup's ``lxml`` parser, and the non-empty
        results are joined with newlines.

        Returns:
            Dict with ``content``, ``type`` (``'epub'``) and ``metadata``
            (``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            book = epub.read_epub(file_path)
            chapters: List[str] = []
            for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                markup = item.get_content().decode('utf-8', errors='ignore')
                soup = BeautifulSoup(markup, 'lxml')
                chapter_text = soup.get_text(separator=' ', strip=True)
                if chapter_text:
                    chapters.append(chapter_text)
            content = '\n'.join(chapters)
            metadata = {'words': len(content.split()), 'characters': len(content)}
            return {'content': content, 'type': 'epub', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга EPUB файла {file_path}: {e}")
            raise
class MobiAzwParser(DocumentParser):
    """Parser for ``.mobi`` / ``.azw`` / ``.azw3`` e-books, delegated to textract."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.mobi', '.azw', '.azw3']

    def parse(self, file_path: str) -> Dict[str, Any]:
        """Extract text with ``textract.process`` and decode it as UTF-8.

        Undecodable bytes are ignored.

        Returns:
            Dict with ``content``, ``type`` (``'ebook'``) and ``metadata``
            (``words``, ``characters``).

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            text = textract.process(file_path).decode('utf-8', errors='ignore')
            metadata = {'words': len(text.split()), 'characters': len(text)}
            return {'content': text, 'type': 'ebook', 'metadata': metadata}
        except Exception as e:
            logger.error(f"Ошибка парсинга MOBI/AZW файла {file_path}: {e}")
            raise
class DocumentParserManager:
    """Chooses a parser for a file by extension, falling back to content sniffing."""

    def __init__(self):
        # Order matters: parse_document uses the first parser that matches.
        self.parsers = [
            TextParser(),
            JSONParser(),
            CSVParser(),
            XMLParser(),
            HTMLParser(),
            PDFParser(),
            DOCXParser(),
            XLSParser(),
            XLSLegacyParser(),
            RTFParser(),
            ODTParser(),
            PPTXParser(),
            LegacyOfficeParser(),
            EPUBParser(),
            MobiAzwParser()
        ]

    def detect_file_type_by_content(self, file_path: str) -> str:
        """Guess a file's type from its leading bytes.

        Returns:
            A dot-prefixed extension (``'.pdf'``, ``'.docx'``, ``'.xlsx'``,
            ``'.pptx'``, ``'.epub'``, ``'.odt'``, ``'.zip'``, ``'.doc'``,
            ``'.rtf'``, ``'.json'``, ``'.html'``, ``'.xml'``, ``'.txt'``),
            or ``''`` when the file cannot be read or is not valid UTF-8
            text. Errors are logged, never raised.
        """
        try:
            with open(file_path, 'rb') as f:
                head = f.read(8)
            if head.startswith(b'%PDF'):
                return '.pdf'
            if head.startswith(b'PK\x03\x04'):
                return self._detect_zip_container(file_path)
            if head.startswith(b'\xd0\xcf\x11\xe0'):
                return '.doc'
            # RTF must be tested before JSON: b'{\\rtf' also starts with
            # b'{', so the previous order reported every RTF file as JSON.
            if head.startswith(b'{\\rtf'):
                return '.rtf'
            if head.startswith((b'{', b'[')):
                return '.json'
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    sample = f.read(512)
            except (OSError, UnicodeDecodeError):
                # Unreadable or not UTF-8 text: type unknown.
                return ''
            lowered = sample.lower()
            if '<html' in lowered or '<!doctype' in lowered:
                return '.html'
            if '<?xml' in sample:
                return '.xml'
            return '.txt'
        except Exception as e:
            logger.error(f"Ошибка определения типа файла {file_path}: {e}")
            return ''

    def _detect_zip_container(self, file_path: str) -> str:
        """Classify a ZIP-based file by its entry names and ``mimetype`` entry.

        Falls back to scanning the first 4096 raw bytes for the same
        markers when the archive directory cannot be read.
        """
        import zipfile

        try:
            with zipfile.ZipFile(file_path) as archive:
                names = archive.namelist()
                mimetype = b''
                if 'mimetype' in names:
                    with archive.open('mimetype') as entry:
                        mimetype = entry.read(128).strip()
        except (zipfile.BadZipFile, OSError, RuntimeError, NotImplementedError):
            with open(file_path, 'rb') as f:
                content = f.read(4096)
            if b'word/' in content:
                return '.docx'
            if b'xl/' in content:
                return '.xlsx'
            if b'ppt/' in content:
                return '.pptx'
            if b'OEBPS' in content or b'epub' in content:
                return '.epub'
            return '.zip'

        def has_dir(prefix: str) -> bool:
            return any(name.startswith(prefix) for name in names)

        if has_dir('word/'):
            return '.docx'
        if has_dir('xl/'):
            return '.xlsx'
        if has_dir('ppt/'):
            return '.pptx'
        if b'epub' in mimetype or has_dir('OEBPS'):
            return '.epub'
        if mimetype.startswith(b'application/vnd.oasis.opendocument.text'):
            return '.odt'
        return '.zip'

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Parse *file_path* with the first suitable parser.

        Selection order:
          1. ``.doc`` files are sniffed first, because a ``.doc`` name may
             hide a DOCX container; they go to ``DOCXParser`` or
             ``LegacyOfficeParser`` accordingly.
          2. Otherwise, the first parser whose extensions match.
          3. Otherwise, the first parser matching the sniffed type.

        Raises:
            ValueError: if no parser matches.
            Exception: whatever the selected parser raises.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == '.doc':
            detected_ext = self.detect_file_type_by_content(file_path)
            if detected_ext == '.docx':
                logger.info(f"Файл {file_path} определен как DOCX по содержимому")
                wanted = DOCXParser
            else:
                logger.info(f"Файл {file_path} определен как старый формат DOC, используем LegacyOfficeParser")
                wanted = LegacyOfficeParser
            for parser in self.parsers:
                if isinstance(parser, wanted):
                    return parser.parse(file_path)

        for parser in self.parsers:
            if parser.can_parse(file_path):
                logger.info(f"Найден парсер по расширению для {file_path}")
                return parser.parse(file_path)

        logger.info(f"Не найден парсер по расширению для {file_path}, определяем тип по содержимому")
        detected_ext = self.detect_file_type_by_content(file_path)
        if detected_ext:
            logger.info(f"Определен тип файла {file_path} как {detected_ext}")
            # Appending the detected extension lets can_parse() do the match.
            temp_name = file_path + detected_ext
            for parser in self.parsers:
                if parser.can_parse(temp_name):
                    logger.info(f"Найден парсер {parser.__class__.__name__} для типа {detected_ext}")
                    return parser.parse(file_path)
        raise ValueError(f"Нет парсера для файла {file_path}")

    def get_supported_extensions(self) -> List[str]:
        """Return every registered parser's extensions, in registration order."""
        return [ext for parser in self.parsers for ext in parser.supported_extensions]
# Shared module-level manager instance, created at import time.
parser_manager = DocumentParserManager()