import os
import sys
import json
import logging
import requests
import PyPDF2
from pathlib import Path
from typing import List, Dict, Any
import time
from document_parsers import parser_manager
from embedding_service import embedding_service
# Process-wide logging: timestamp, logger name, level, then the message.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

# Service endpoints are read from the environment; an unset variable yields
# an empty string.
CHROMADB_BASE_URL = os.environ.get('CHROMADB_BASE_URL', '')
OLLAMA_BASE_URL = os.environ.get('OLLAMA_BASE_URL', '')

# Embedding model name, overridable through the environment.
EMBEDDING_MODEL = os.environ.get(
    'EMBEDDING_MODEL',
    'jeffh/intfloat-multilingual-e5-large-instruct:q8_0',
)

# Directory scanned for source documents.
DATA_DIR = '/opt/autogen/data'

# Module-level chunk size constant.
CHUNK_SIZE = 1000
class DocumentProcessor:

    def __init__(self):
        """Set up endpoints, paths and logger, then load parser settings.

        Values are taken from the module-level constants. The Ollama URL is
        stored without trailing slashes (or as an empty string when unset).
        """
        self.logger = logger
        self.data_dir = Path(DATA_DIR)
        self.embedding_model = EMBEDDING_MODEL
        self.chromadb_url = CHROMADB_BASE_URL
        self.ollama_url = (OLLAMA_BASE_URL or '').rstrip('/')
        # Must run last: it needs self.logger and may replace
        # self.embedding_model with the value from parser_settings.json.
        self.load_parser_settings()
    def load_parser_settings(self):
        """Load chunking and preprocessing options from ``parser_settings.json``.

        The file is looked up relative to the current working directory.
        Keys missing from the file fall back to the built-in defaults below;
        ``embedding_model`` falls back to the value already on the instance.
        When the file is absent, or reading/parsing it raises, all chunking
        and preprocessing options are reset to their defaults and the
        situation is logged. ``embedding_model`` is left untouched in that
        case.
        """
        defaults = {
            'chunk_size': 1000,
            'chunk_overlap': 200,
            'remove_extra_whitespace': True,
            'normalize_unicode': True,
            'remove_special_chars': False,
            'lowercase_text': False,
        }

        def use_defaults():
            # Reset every chunking/preprocessing option to its default.
            for option, value in defaults.items():
                setattr(self, option, value)

        try:
            config_path = Path('parser_settings.json')
            if not config_path.exists():
                use_defaults()
                self.logger.warning("Файл parser_settings.json не найден, используются значения по умолчанию")
                return
            with open(config_path, 'r', encoding='utf-8') as handle:
                loaded = json.load(handle)
                for option, value in defaults.items():
                    setattr(self, option, loaded.get(option, value))
                self.embedding_model = loaded.get('embedding_model', self.embedding_model)
                self.logger.info(f"Загружены настройки парсера: chunk_size={self.chunk_size}, chunk_overlap={self.chunk_overlap}, embedding_model={self.embedding_model}")
                self.logger.info(f"Настройки предобработки: remove_extra_whitespace={self.remove_extra_whitespace}, normalize_unicode={self.normalize_unicode}, remove_special_chars={self.remove_special_chars}, lowercase_text={self.lowercase_text}")
        except Exception as exc:
            self.logger.error(f"Ошибка загрузки настроек парсера: {exc}")
            use_defaults()
    def extract_text_from_pdf(self, pdf_path: Path) -> str:
        """Extract the text of a PDF, one labelled section per page.

        Each page that yields text has its whitespace runs collapsed to a
        single space and is prefixed with a ``Страница N:`` header (1-based).
        Sections are joined with blank lines. Pages without text are skipped.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The combined text, or an empty string if anything raises while
            opening or parsing the file (the error is logged).
        """
        import re

        try:
            with open(pdf_path, 'rb') as stream:
                reader = PyPDF2.PdfReader(stream)
                sections = []
                for number, page in enumerate(reader.pages, start=1):
                    raw = page.extract_text()
                    if not raw:
                        continue
                    collapsed = re.sub(r'\s+', ' ', raw).strip()
                    sections.append(f"Страница {number}:\n{collapsed}")
                combined = '\n\n'.join(sections)
                self.logger.info(f"Извлечен текст из {pdf_path.name}: {len(combined)} символов")
                return combined
        except Exception as exc:
            self.logger.error(f"Ошибка извлечения текста из {pdf_path}: {exc}")
            return ""
    def chunk_text(self, text: str, chunk_size: int = None, chunk_overlap: int = None) -> List[str]:
        """Split ``text`` into fixed-size character windows that overlap.

        Consecutive windows start ``chunk_size - chunk_overlap`` characters
        apart. Windows consisting only of whitespace are dropped.

        Args:
            text: Text to split.
            chunk_size: Window length in characters; defaults to
                ``self.chunk_size``.
            chunk_overlap: Characters shared by neighbouring windows;
                defaults to ``self.chunk_overlap``.

        Returns:
            The list of chunks in document order (empty for empty text).

        Raises:
            ValueError: If ``text`` is non-empty and ``chunk_overlap`` is not
                smaller than ``chunk_size``. The window would then never
                advance, which previously made this method loop forever.
        """
        if chunk_size is None:
            chunk_size = self.chunk_size
        if chunk_overlap is None:
            chunk_overlap = self.chunk_overlap

        step = chunk_size - chunk_overlap
        if text and step <= 0:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be smaller than "
                f"chunk_size ({chunk_size})"
            )

        chunks = []
        if text:
            # NOTE: windows keep being emitted until the start offset passes
            # the end of the text, so the last chunk may lie entirely inside
            # the overlap region of the previous one.
            windows = (text[start:start + chunk_size] for start in range(0, len(text), step))
            chunks = [window for window in windows if window.strip()]

        self.logger.info(f"Текст разбит на {len(chunks)} чанков (chunk_size={chunk_size}, overlap={chunk_overlap})")
        return chunks
    def preprocess_text(self, text: str) -> str:
        """Normalise ``text`` according to the instance's preprocessing flags.

        Steps run in this order, each only when its flag is truthy:

        1. ``remove_extra_whitespace``: collapse runs of spaces/tabs to one
           space, cap consecutive newlines at two, strip both ends.
        2. ``normalize_unicode``: apply NFC normalisation.
        3. ``remove_special_chars``: drop every character that is not a word
           character, whitespace, or one of ``. , ; : ! ? ( ) - — – « » " ' ```.
        4. ``lowercase_text``: lowercase the result.

        Args:
            text: Text to clean.

        Returns:
            The cleaned text; falsy input is returned unchanged.
        """
        import re
        import unicodedata

        if not text:
            return text
        if self.remove_extra_whitespace:
            text = re.sub(r'[ \t]+', ' ', text)
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = text.strip()
        if self.normalize_unicode:
            text = unicodedata.normalize('NFC', text)
        if self.remove_special_chars:
            # The apostrophe is escaped on purpose. Written unescaped inside a
            # single-quoted literal it ended the string early, so the pattern
            # silently lost "'" from the allowed set and apostrophes were
            # stripped from the text.
            text = re.sub(r'[^\w\s.,;:!?()\-—–«»"\'`]', '', text, flags=re.UNICODE)
        if self.lowercase_text:
            text = text.lower()
        return text
    def create_embedding(self, text: str) -> List[float]:
        """Preprocess ``text`` and return its embedding.

        The text first goes through ``preprocess_text``; the cleaned result is
        passed to ``embedding_service.create_embedding`` and that call's
        return value is handed back unchanged.
        """
        return embedding_service.create_embedding(self.preprocess_text(text))
    def create_collection(self) -> bool:
        """Request creation of the ``default`` collection over HTTP.

        Sends a POST to ``<chromadb_url>/api/v1/collections`` with a 30 second
        timeout.

        Returns:
            True when the server answers 200 or 201. False for any other
            status code or when the request raises (both are logged).
        """
        try:
            payload = {
                "name": "default",
                "metadata": {
                    "description": "RAG Chat API documents",
                    "created_at": time.time()
                }
            }
            reply = requests.post(
                f"{self.chromadb_url}/api/v1/collections",
                json=payload,
                timeout=30
            )
            accepted = reply.status_code in (200, 201)
            if accepted:
                self.logger.info("Коллекция создана или уже существует")
            else:
                self.logger.error(f"Ошибка создания коллекции: {reply.status_code}")
            return accepted
        except Exception as exc:
            self.logger.error(f"Ошибка создания коллекции: {exc}")
            return False
    def add_documents_to_chromadb(self, documents: List[Dict[str, Any]]) -> bool:
        """Store prepared chunk records through the project's Chroma client.

        Args:
            documents: Records carrying the keys ``id``, ``text`` and
                ``metadata``.

        Returns:
            True when ``chroma_client.add_documents`` returns a truthy value.
            False when it returns a falsy value or when anything raises,
            including the import of the client or a missing key (logged).
        """
        try:
            # Imported lazily so a broken client is reported as a failure
            # here instead of breaking the import of this module.
            from chroma_client import chroma_client

            ids, texts, metadatas = [], [], []
            for record in documents:
                ids.append(record['id'])
                texts.append(record['text'])
                metadatas.append(record['metadata'])

            stored = chroma_client.add_documents(texts, metadatas, ids)
            if not stored:
                self.logger.error("Ошибка добавления документов в ChromaDB")
                return False
            self.logger.info(f"Добавлено {len(documents)} документов в ChromaDB")
            return True
        except Exception as exc:
            self.logger.error(f"Ошибка добавления документов: {exc}")
            return False
    def process_text_file(self, text_path: Path) -> List[Dict[str, Any]]:
        """Read a UTF-8 text file and turn it into chunk records.

        Chunking and record construction are delegated to
        ``process_parsed_content`` (file type ``'text'``), so both paths
        produce the same ids (``<stem>_chunk_<i>``) and metadata layout
        instead of maintaining two copies of that logic.

        Args:
            text_path: Path of the text file.

        Returns:
            One record per chunk, or an empty list when the file is blank or
            when reading/processing raises (both cases are logged).
        """
        self.logger.info(f"Обработка текстового файла: {text_path.name}")
        try:
            content = text_path.read_text(encoding='utf-8')
            if not content.strip():
                self.logger.warning(f"Файл {text_path.name} пуст")
                return []
            return self.process_parsed_content(content, text_path.name, 'text')
        except Exception as exc:
            self.logger.error(f"Ошибка обработки текстового файла {text_path.name}: {exc}")
            return []
    def process_parsed_content(self, content: str, source_name: str, file_type: str) -> List[Dict[str, Any]]:
        """Chunk already-extracted text and wrap each chunk in a record.

        Args:
            content: Extracted document text.
            source_name: File name stored as ``source``; its stem prefixes
                every chunk id (``<stem>_chunk_<i>``).
            file_type: Value stored under ``file_type`` in the metadata.

        Returns:
            One record per chunk with ``id``, ``text`` and ``metadata``
            (``source``, ``chunk_index``, ``total_chunks``, ``file_type``).
            Empty list when ``content`` is empty or only whitespace.
        """
        if not (content and content.strip()):
            return []

        pieces = self.chunk_text(content)
        total = len(pieces)
        stem = Path(source_name).stem
        documents: List[Dict[str, Any]] = [
            {
                'id': f"{stem}_chunk_{index}",
                'text': piece,
                'metadata': {
                    'source': source_name,
                    'chunk_index': index,
                    'total_chunks': total,
                    'file_type': file_type
                }
            }
            for index, piece in enumerate(pieces)
        ]
        self.logger.info(f"Подготовлено {len(documents)} чанков для батчевой обработки")
        return documents
    def process_any_file(self, path: Path) -> List[Dict[str, Any]]:
        """Parse a file with ``parser_manager`` and build chunk records.

        The parser result is read as a mapping: ``content`` supplies the
        text and ``type`` the file type. When ``type`` is missing, the
        lower-cased file extension without the dot is used instead.

        Args:
            path: Path of the document to ingest.

        Returns:
            Chunk records from ``process_parsed_content``, or an empty list
            when parsing or chunking raises (the error is logged).
        """
        self.logger.info(f"Обработка файла (generic): {path.name}")
        try:
            extension = path.suffix.lower().lstrip('.')
            result = parser_manager.parse_document(str(path))
            return self.process_parsed_content(
                result.get('content', ''),
                path.name,
                result.get('type', extension),
            )
        except Exception as exc:
            self.logger.error(f"Ошибка generic-обработки {path}: {exc}")
            return []
    def process_pdf_file(self, pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract a PDF's text and turn it into chunk records.

        Ids have the form ``<stem>_<i>`` (no ``_chunk_`` infix, unlike the
        text and generic paths of this class).

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            One record per chunk with ``file_type`` set to ``'pdf'``, or an
            empty list when no text could be extracted.
        """
        self.logger.info(f"Обработка файла: {pdf_path.name}")
        extracted = self.extract_text_from_pdf(pdf_path)
        if not extracted:
            return []

        pieces = self.chunk_text(extracted)
        total = len(pieces)
        documents = [
            {
                'id': f"{pdf_path.stem}_{index}",
                'text': piece,
                'metadata': {
                    'source': pdf_path.name,
                    'chunk_index': index,
                    'total_chunks': total,
                    'file_type': 'pdf'
                }
            }
            for index, piece in enumerate(pieces)
        ]
        self.logger.info(f"Подготовлено {len(documents)} чанков для батчевой обработки")
        return documents
    def process_all_documents(self) -> bool:
        """Ingest every ``*.pdf`` file found directly inside the data directory.

        Returns:
            The result of ``add_documents_to_chromadb`` for all collected
            chunks. False earlier when the directory is missing, holds no
            matching files, the collection cannot be created, or no file
            produced any chunk.
        """
        source_dir = self.data_dir
        if not source_dir.exists():
            self.logger.error(f"Директория {self.data_dir} не существует")
            return False

        pdf_paths = [*source_dir.glob("*.pdf")]
        if not pdf_paths:
            self.logger.error(f"PDF файлы не найдены в {self.data_dir}")
            return False
        self.logger.info(f"Найдено {len(pdf_paths)} PDF файлов")

        if not self.create_collection():
            return False

        collected = [
            record
            for pdf_path in pdf_paths
            for record in self.process_pdf_file(pdf_path)
        ]
        if not collected:
            self.logger.error("Не удалось обработать ни одного документа")
            return False
        return self.add_documents_to_chromadb(collected)
    def check_services(self) -> bool:
        """Check that an Ollama endpoint and ChromaDB can be reached.

        Ollama candidates are probed in order with ``GET <url>/api/tags``
        (5 second timeout): the configured URL first (if set), then
        ``OLLAMA_FALLBACK_URL`` from the environment, then two fixed Docker
        host addresses. Candidates equal to the configured URL are not
        probed twice. The first candidate answering 200 wins; when it is not
        the configured one, ``self.ollama_url`` and the ``OLLAMA_BASE_URL``
        environment variable are switched to it.

        ChromaDB is checked by listing collections through the project's
        ``chroma_client``.

        Returns:
            True when both services respond, False otherwise (logged).
        """
        env_fallback = os.getenv('OLLAMA_FALLBACK_URL', '')
        primary = self.ollama_url.rstrip('/')

        # Configured URL goes first; the remaining candidates follow in a
        # fixed order, skipping empty values and repeats of the primary.
        candidates = [primary] if self.ollama_url else []
        extras = (
            env_fallback.rstrip('/') if env_fallback else None,
            'http://172.17.0.1:11434',
            'http://host.docker.internal:11434',
        )
        candidates.extend(url for url in extras if url and url != primary)

        reachable = False
        switched = False
        for candidate in candidates:
            try:
                self.logger.info(f"Проверка Ollama: {candidate}")
                reply = requests.get(f"{candidate}/api/tags", timeout=5)
                if reply.status_code != 200:
                    continue
                reachable = True
                if candidate == primary:
                    self.logger.info(f"Ollama доступен: {candidate}")
                else:
                    self.ollama_url = candidate
                    switched = True
                    self.logger.warning(f"Основной Ollama недоступен, используется fallback: {candidate}")
                break
            except Exception as exc:
                self.logger.debug(f"Ollama недоступен на {candidate}: {exc}")
                continue

        if not reachable:
            self.logger.error("Ollama недоступен ни на одном из URL (основной и fallback)")
            return False

        if switched:
            # Export the working endpoint through the process environment.
            os.environ['OLLAMA_BASE_URL'] = self.ollama_url
            self.logger.info(f"OLLAMA_BASE_URL обновлен на fallback: {self.ollama_url}")

        try:
            from chroma_client import chroma_client
            chroma_client.client.list_collections()
        except Exception as exc:
            self.logger.error(f"Ошибка подключения к ChromaDB: {exc}")
            return False

        self.logger.info("Все сервисы доступны")
        return True
def process_single_document(file_path: str, category: str = 'general') -> bool:
    """Ingest one document and tag every chunk with ``category``.

    A fresh ``DocumentProcessor`` is created and its services are checked
    before the file is parsed, chunked and stored.

    Args:
        file_path: Path of the document to ingest.
        category: Value written to ``metadata['category']`` of each chunk.

    Returns:
        The result of storing the chunks. False when the services are
        unavailable, the file yields no chunks, or processing raises.
    """
    logger.info(f"Обработка документа: {file_path}")
    processor = DocumentProcessor()
    if not processor.check_services():
        logger.error("Не все сервисы доступны")
        return False

    try:
        records = processor.process_any_file(Path(file_path))
        if not records:
            logger.error(f"Не удалось обработать документ: {file_path}")
            return False
        for record in records:
            record['metadata']['category'] = category
        return processor.add_documents_to_chromadb(records)
    except Exception as exc:
        logger.error(f"Ошибка обработки документа {file_path}: {exc}")
        return False
def main():
    """Run the batch ingestion and exit with status 1 on failure.

    The process exits with status 1 when the service check fails or when
    processing the documents fails; otherwise success is logged.
    """
    logger.info("Запуск обработки документов...")
    processor = DocumentProcessor()

    if not processor.check_services():
        logger.error("Не все сервисы доступны")
        sys.exit(1)

    succeeded = processor.process_all_documents()
    if not succeeded:
        logger.error("Ошибка обработки документов")
        sys.exit(1)
    else:
        logger.info("Обработка документов завершена успешно")
# Run the batch ingestion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()