import os
import re
import logging
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Any, Set, Optional
import time
from pathlib import Path
import json
# Module-level logger named after this module; all log calls in this file use it.
logger = logging.getLogger(__name__)
class WebsiteCrawler:

    def __init__(self, base_url: str, max_pages: int = 100, delay: float = 1.0, check_domain: bool = True, use_llm: bool = True):
        """Configure the crawler and create its shared HTTP session.

        Args:
            base_url: Start URL of the crawl; a trailing slash is removed.
            max_pages: Upper bound on the number of successfully processed pages.
            delay: Seconds to sleep between requests (``0`` disables the pause).
            check_domain: When true, ``is_valid_url`` only accepts URLs whose
                host matches the host of ``base_url``.
            use_llm: When true, LLM settings are read from the environment
                variables ``OLLAMA_BASE_URL``, ``MODEL_NAME``, ``TIMEOUT`` and
                ``NUM_CTX``.

        Raises:
            ValueError: If ``use_llm`` is true and ``TIMEOUT`` or ``NUM_CTX``
                is set to a non-integer value.
        """
        self.base_url = base_url.rstrip('/')
        self.max_pages = max_pages
        self.delay = delay
        self._check_domain = check_domain
        self.use_llm = use_llm
        self.visited_urls: Set[str] = set()
        self.pages_content: List[Dict[str, Any]] = []
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # The LLM attributes always exist, even with use_llm=False, so that
        # flipping `use_llm` later or inspecting the settings never raises
        # AttributeError. An empty ollama_url keeps the LLM path disabled.
        self.ollama_url = ''
        self.llm_model = 'gpt-oss:latest'
        self.llm_timeout = 600
        self.llm_num_ctx = 128000

        if self.use_llm:
            # OLLAMA_BASE_URL may hold a comma-separated list; only the first
            # entry is used.
            raw_url = os.getenv('OLLAMA_BASE_URL', '')
            self.ollama_url = raw_url.split(',')[0].strip().rstrip('/')
            self.llm_model = os.getenv('MODEL_NAME', 'gpt-oss:latest')
            self.llm_timeout = int(os.getenv('TIMEOUT', '600'))
            self.llm_num_ctx = int(os.getenv('NUM_CTX', '128000'))
            logger.info(f"LLM обработка включена: {self.ollama_url}, модель: {self.llm_model}, контекст: {self.llm_num_ctx}")
    def is_valid_url(self, url: str) -> bool:
        """Return True when *url* is an http(s) page URL the crawler may fetch.

        A URL is rejected when it has no host, uses a scheme other than
        http/https, points at a known binary/document file extension
        (matched case-insensitively), cannot be parsed at all, or - when
        domain checking is enabled - lives on a different host than
        ``base_url``.
        """
        skipped_extensions = (
            '.jpg', '.jpeg', '.png', '.gif', '.svg', '.pdf',
            '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx',
        )
        try:
            parsed = urlparse(url)
            if parsed.scheme not in ('http', 'https') or not parsed.netloc:
                return False
            # Lower-case the path so "FILE.PDF" is filtered like "file.pdf".
            if parsed.path.lower().endswith(skipped_extensions):
                return False
            if getattr(self, '_check_domain', False):
                # Host names are case-insensitive, so compare them lower-cased.
                base_domain = urlparse(self.base_url).netloc.lower()
                return parsed.netloc.lower() == base_domain
            return True
        except ValueError:
            # urlparse raises ValueError for malformed input such as an
            # unbalanced IPv6 bracket; such a URL is simply not crawlable.
            return False
    def clean_text(self, text: str) -> str:
        """Collapse every whitespace run in *text* to one space and trim the ends.

        Falsy input (``None`` or an empty string) yields ``""``.
        """
        # Guard first: re.sub would raise on None.
        return re.sub(r'\s+', ' ', text).strip() if text else ""
    def process_with_llm(self, raw_html: str, url: str) -> Optional[Dict[str, Any]]:
        """Extract structured page data from raw HTML through the Ollama ``/api/generate`` endpoint.

        Args:
            raw_html: Full HTML of the page; only the first 100000 characters
                are sent to the model.
            url: Page URL, copied into the result and used in log messages.

        Returns:
            A page dict with the keys ``url``, ``title``, ``content``,
            ``meta_description``, ``category``, ``text_length`` and
            ``processed_by_llm`` (always True), or ``None`` when LLM
            processing is disabled, no server URL is configured, the request
            fails, or the reply contains no usable JSON object. This method
            never raises; every error is logged and turned into ``None``.
        """
        if not self.use_llm or not getattr(self, 'ollama_url', ''):
            return None
        try:
            html_preview = raw_html[:100000]
            logger.debug(f"Отправка HTML в LLM: {len(html_preview)} символов из {len(raw_html)}")
            prompt = self._build_llm_prompt(html_preview, url)
            request_options = {
                "temperature": 0.0,
                "num_ctx": self.llm_num_ctx,
                "num_predict": 20000,
                "top_p": 0.9,
                "repeat_penalty": 1.1,
                "stop": ["\n\n\n", "```", "JSON:", "Вот", "Ответ:", "Объяснение:"]
            }
            logger.debug(f"Отправка запроса к LLM с num_ctx={self.llm_num_ctx}, длина промпта={len(prompt)} символов")
            request_payload = {
                "model": self.llm_model,
                "prompt": prompt,
                "stream": False,
                "think": False,
                "options": request_options
            }
            response = self.session.post(
                f"{self.ollama_url}/api/generate",
                json=request_payload,
                timeout=self.llm_timeout
            )
            if response.status_code != 200:
                logger.warning(f"Ошибка LLM обработки: {response.status_code}")
                return None
            # `or ''` also covers an explicit null in the "response" field.
            llm_response = (response.json().get('response') or '').strip()
            llm_data = self._parse_llm_json(llm_response)
            if llm_data is None:
                return None
            return self._build_llm_result(llm_data, url)
        except Exception as e:
            # Best-effort step: the caller falls back to plain HTML parsing.
            logger.warning(f"Ошибка LLM обработки для {url}: {e}")
            return None

    def _build_llm_prompt(self, html_preview: str, url: str) -> str:
        """Build the extraction prompt sent to the model for one page."""
        # NOTE(review): the original prompt template was missing from the
        # source (the line read `prompt = f`, a NameError at runtime). This
        # text is a reconstruction that requests exactly the JSON keys read
        # by _build_llm_result - adjust the wording as needed.
        return (
            "Ты — система извлечения контента из веб-страниц. "
            "Проанализируй HTML страницы и извлеки основной содержательный текст, "
            "игнорируя навигацию, рекламу, футер, скрипты и стили.\n"
            f"URL страницы: {url}\n\n"
            "Верни ТОЛЬКО один валидный JSON-объект без пояснений и без markdown-разметки "
            "со следующими полями:\n"
            '- "title": заголовок страницы (строка)\n'
            '- "content": основной текст страницы (строка)\n'
            '- "key_facts": список ключевых фактов (массив строк)\n'
            '- "summary": краткое резюме страницы (строка)\n'
            '- "meta_description": краткое описание страницы (строка)\n'
            '- "category": категория страницы (строка)\n\n'
            "HTML страницы:\n"
            + html_preview
        )

    def _parse_llm_json(self, llm_response: str) -> Optional[Dict[str, Any]]:
        """Pull the outermost JSON object out of a model reply, repairing common defects."""
        cleaned = llm_response.strip()
        if cleaned.startswith('```'):
            # Drop markdown code fences the model may wrap around the JSON.
            cleaned = re.sub(r'```json\s*', '', cleaned)
            cleaned = re.sub(r'```\s*', '', cleaned)
            cleaned = cleaned.strip()
        json_start = cleaned.find('{')
        json_end = cleaned.rfind('}')
        if json_start == -1 or json_end <= json_start:
            return None
        json_str = cleaned[json_start:json_end + 1]
        try:
            llm_data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning(f"Не удалось распарсить JSON от LLM: {e}")
            logger.debug(f"JSON строка (первые 500 символов): {json_str[:500]}")
            # Repair attempt: trailing commas, raw control characters and
            # unbalanced closing braces.
            json_str_fixed = re.sub(r',\s*}', '}', json_str)
            json_str_fixed = re.sub(r',\s*]', ']', json_str_fixed)
            # Replace (not delete) control characters so words separated by
            # a raw newline or tab inside a string do not get glued together.
            json_str_fixed = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str_fixed)
            missing = json_str_fixed.count('{') - json_str_fixed.count('}')
            if missing > 0:
                json_str_fixed += '}' * missing
            try:
                llm_data = json.loads(json_str_fixed)
            except json.JSONDecodeError as e2:
                logger.debug(f"Автоисправление не помогло: {e2}")
                logger.debug(f"Исправленная JSON строка (первые 500 символов): {json_str_fixed[:500]}")
                return None
            logger.info("JSON исправлен и успешно распарсен после автоисправления")
        return llm_data if isinstance(llm_data, dict) else None

    def _build_llm_result(self, llm_data: Dict[str, Any], url: str) -> Dict[str, Any]:
        """Convert the parsed model reply into the crawler's page dict."""
        # The model may emit null or non-string values; normalise them so the
        # string concatenation below cannot fail.
        content = llm_data.get('content') or ''
        if not isinstance(content, str):
            content = str(content)
        key_facts = llm_data.get('key_facts')
        if key_facts:
            if not isinstance(key_facts, (list, tuple)):
                # A bare string would otherwise be listed character by character.
                key_facts = [key_facts]
            content += '\n\nКлючевые факты:\n' + '\n'.join(f"- {fact}" for fact in key_facts)
        # The summary is only appended to long pages.
        if llm_data.get('summary') and len(content) > 2000:
            content += f"\n\nРезюме: {llm_data['summary']}"
        return {
            'url': url,
            'title': llm_data.get('title') or '',
            'content': content,
            'meta_description': llm_data.get('meta_description') or '',
            'category': llm_data.get('category') or '',
            'text_length': len(content),
            'processed_by_llm': True
        }
    def extract_text_from_html(self, html: str, url: str) -> Optional[Dict[str, Any]]:
        """Turn a page's HTML into a page dict, preferring the LLM and falling back to BeautifulSoup.

        Args:
            html: Raw HTML of the page.
            url: Page URL, copied into the result and used in log messages.

        Returns:
            A dict with the keys ``url``, ``title``, ``content``,
            ``meta_description``, ``text_length`` and ``processed_by_llm``.
            When HTML parsing itself fails, the same dict is returned with
            empty fields and ``text_length`` 0.
        """
        if self.use_llm:
            llm_result = self.process_with_llm(html, url)
            if llm_result and llm_result.get('text_length', 0) > 0:
                logger.info(f"Страница {url} обработана через LLM, длина контента: {llm_result.get('text_length', 0)}")
                return llm_result
            logger.info(f"LLM обработка не удалась для {url} или вернула пустой контент, используем стандартный парсинг")
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # Remove page chrome and non-text elements before extracting text.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()

            title = self.clean_text(soup.title.get_text()) if soup.title else ""
            if not title:
                # Use the first <h1> when <title> is missing or empty.
                first_heading = soup.find('h1')
                if first_heading:
                    title = self.clean_text(first_heading.get_text())

            container = soup.find('main') or soup.find('article') or soup.find('body')
            if container:
                text_parts = []
                # ids of elements whose text was already collected. find_all
                # also returns nested matches (e.g. a <p> inside a <li> or
                # <td>); their text is part of the ancestor's text, so
                # collecting them again would duplicate content.
                emitted = set()
                for element in container.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th']):
                    if any(id(ancestor) in emitted for ancestor in element.parents):
                        continue
                    text = self.clean_text(element.get_text())
                    # Very short fragments are dropped.
                    if text and len(text) > 10:
                        text_parts.append(text)
                        emitted.add(id(element))
                content = '\n\n'.join(text_parts)
                if not content:
                    # No block-level elements matched (e.g. a <div>-only
                    # layout): fall back to the container's whole text
                    # instead of reporting an empty page.
                    content = self.clean_text(container.get_text(separator=' '))
            else:
                content = self.clean_text(soup.get_text())

            meta_description = ""
            meta_desc_tag = soup.find('meta', attrs={'name': 'description'})
            if meta_desc_tag and meta_desc_tag.get('content'):
                meta_description = self.clean_text(meta_desc_tag.get('content'))
            return {
                'url': url,
                'title': title,
                'content': content,
                'meta_description': meta_description,
                'text_length': len(content),
                'processed_by_llm': False
            }
        except Exception as e:
            # Parsing is best-effort: report an empty page instead of raising.
            logger.error(f"Ошибка извлечения текста из {url}: {e}")
            return {
                'url': url,
                'title': '',
                'content': '',
                'meta_description': '',
                'text_length': 0,
                'processed_by_llm': False
            }
    def find_links(self, html: str, current_url: str) -> List[str]:
        """Collect the crawlable, not-yet-visited links found on a page.

        Each ``href`` is resolved against *current_url* and stripped of its
        fragment. A link is returned once, in document order, if it passes
        ``is_valid_url`` and is not in ``visited_urls``.

        Returns:
            List of absolute URLs; empty when the HTML cannot be parsed.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')
            links = []
            seen = set()
            for anchor in soup.find_all('a', href=True):
                href = anchor['href'].strip()
                try:
                    absolute_url = urljoin(current_url, href).split('#')[0]
                    if absolute_url in seen or absolute_url in self.visited_urls:
                        continue
                    if self.is_valid_url(absolute_url):
                        seen.add(absolute_url)
                        links.append(absolute_url)
                except ValueError as e:
                    # urljoin/urlparse raise ValueError on malformed hrefs;
                    # skip only that link instead of losing the whole page.
                    logger.debug(f"Пропуск некорректной ссылки {href!r} на {current_url}: {e}")
            return links
        except Exception as e:
            logger.error(f"Ошибка поиска ссылок на {current_url}: {e}")
            return []
    def crawl_page(self, url: str) -> List[str]:
        """Download and process one page, returning the links found on it.

        On success the URL is added to ``visited_urls`` and its page dict is
        appended to ``pages_content``. Nothing is recorded when the page was
        already visited, the page limit is reached, the response is not HTML,
        fewer than 50 characters of content were extracted, or an error
        occurs.

        Returns:
            Links discovered on the page, or an empty list in every
            skip/error case. This method never raises.
        """
        if url in self.visited_urls:
            return []
        if len(self.visited_urls) >= self.max_pages:
            logger.info(f"Достигнут лимит страниц ({self.max_pages})")
            return []
        try:
            logger.info(f"Скачивание: {url}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                logger.debug(f"Пропуск {url}: не HTML ({content_type})")
                return []
            if 'charset' not in content_type:
                # Without a declared charset, requests decodes text/* bodies
                # as ISO-8859-1, which garbles non-Latin pages; use the
                # encoding detected from the body instead.
                response.encoding = response.apparent_encoding
            html = response.text
            page_data = self.extract_text_from_html(html, url)
            if not page_data:
                logger.warning(f"Не удалось извлечь данные из {url}")
                return []
            text_length = page_data.get('text_length', 0)
            if text_length < 50:
                logger.debug(f"Пропуск {url}: слишком мало контента ({text_length} символов)")
                return []
            self.visited_urls.add(url)
            self.pages_content.append(page_data)
            logger.info(f"Обработано: {url} ({text_length} символов)")
            return self.find_links(html, url) or []
        except requests.exceptions.RequestException as e:
            logger.warning(f"Ошибка при скачивании {url}: {e}")
            return []
        except Exception as e:
            logger.error(f"Неожиданная ошибка при обработке {url}: {e}")
            return []
    def crawl(self) -> List[Dict[str, Any]]:
        """Crawl breadth-first from ``base_url`` until the queue is empty or ``max_pages`` pages are processed.

        Every URL is fetched at most once per call, including URLs whose
        download failed or whose page was skipped, so a broken link
        referenced from many pages is not requested repeatedly.

        Returns:
            ``pages_content``: the list of page dicts collected so far.
        """
        from collections import deque

        logger.info(f"Начало обхода сайта: {self.base_url}")
        logger.info(f"Максимум страниц: {self.max_pages}, задержка: {self.delay}с")
        queue = deque([self.base_url])
        # Every URL ever enqueued. crawl_page only records successful pages
        # in visited_urls, so without this set a failed or skipped URL would
        # be re-queued each time another page links to it.
        seen = {self.base_url}
        while queue and len(self.visited_urls) < self.max_pages:
            current_url = queue.popleft()
            if current_url in self.visited_urls:
                continue
            for link in self.crawl_page(current_url):
                if link not in seen and link not in self.visited_urls:
                    seen.add(link)
                    queue.append(link)
            if self.delay > 0:
                time.sleep(self.delay)
        logger.info(f"Обход завершен. Обработано страниц: {len(self.visited_urls)}")
        return self.pages_content
    def parse_single_url(self, url: str) -> Optional[Dict[str, Any]]:
        """Download and parse one page without following its links.

        Unlike ``crawl_page`` this neither checks nor updates
        ``visited_urls``/``pages_content``. A URL that fails
        ``is_valid_url`` is still fetched; it is only logged as an external
        link.

        Returns:
            The page dict, or ``None`` when the response is not HTML, fewer
            than 50 characters of content were extracted, or any error
            occurred. This method never raises.
        """
        try:
            if not self.is_valid_url(url):
                logger.info(f"Парсинг внешней ссылки: {url}")
            logger.info(f"Парсинг страницы: {url}")
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                logger.warning(f"Пропуск {url}: не HTML ({content_type})")
                return None
            if 'charset' not in content_type:
                # Without a declared charset, requests decodes text/* bodies
                # as ISO-8859-1, which garbles non-Latin pages; use the
                # encoding detected from the body instead.
                response.encoding = response.apparent_encoding
            html = response.text
            page_data = self.extract_text_from_html(html, url)
            if not page_data:
                logger.warning(f"Не удалось извлечь данные из {url}")
                return None
            if page_data['text_length'] < 50:
                logger.warning(f"Пропуск {url}: слишком мало контента ({page_data['text_length']} символов)")
                return None
            logger.info(f"Успешно распарсено: {url} ({page_data['text_length']} символов)")
            return page_data
        except requests.exceptions.RequestException as e:
            logger.error(f"Ошибка при парсинге {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"Неожиданная ошибка при парсинге {url}: {e}")
            return None
    def parse_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """Parse each URL in *urls* with ``parse_single_url`` and keep the pages that succeeded.

        The crawler sleeps ``delay`` seconds after every URL (when the delay
        is positive), whether or not that URL produced a page.
        """
        def _throttled_pages(targets):
            # Yield one parse result per URL, pausing after each of them.
            for target in targets:
                yield self.parse_single_url(target)
                if self.delay > 0:
                    time.sleep(self.delay)

        return [page for page in _throttled_pages(urls) if page]
    def save_to_files(self, output_dir: str) -> List[str]:
        """Write every collected page to its own UTF-8 text file under *output_dir*.

        The directory is created if needed. Files are named
        ``NNNN_<slug>.txt``, where ``NNNN`` is the 1-based page index and the
        slug is the URL path with unsafe characters replaced by ``_`` and cut
        to 100 characters (``index`` for an empty path).

        Returns:
            Paths of the written files, in page order.
        """
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        written: List[str] = []
        for number, page in enumerate(self.pages_content, start=1):
            # Derive a filesystem-safe name from the URL path.
            slug = urlparse(page['url']).path.strip('/') or 'index'
            slug = re.sub(r'[^\w\-_\.]', '_', slug)[:100]
            destination = target_dir / f"{number:04d}_{slug}.txt"

            # Header lines first, then a blank line and the page body.
            pieces = [f"URL: {page['url']}\n", f"Title: {page['title']}\n"]
            if page['meta_description']:
                pieces.append(f"Description: {page['meta_description']}\n")
            pieces.append(f"\n{page['content']}\n")

            destination.write_text(''.join(pieces), encoding='utf-8')
            written.append(str(destination))
            logger.debug(f"Сохранено: {destination}")
        logger.info(f"Сохранено файлов: {len(written)}")
        return written