import importlib
import re
from collections import defaultdict

__all__ = ['GetWebsite']


class GetWebsite:
    """Fetches a website and returns its content as a summary, full text, or raw HTML."""

    dependencies = ["requests", "beautifulsoup4==4.13.3"]

    inputSchema = {
        "name": "GetWebsite",
        "description": "Returns the content of a website with enhanced error handling and output options.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL of the website to fetch content from.",
                },
                "output_type": {
                    "type": "string",
                    "enum": ["summary", "full_text", "html"],
                    "description": "The type of output to return. 'summary' returns a summary of the text, 'full_text' returns the full text content, and 'html' returns the raw HTML content.",
                    "default": "summary"
                },
                "css_selector": {
                    "type": "string",
                    "description": "A CSS selector to extract specific content from the page.",
                }
            },
            "required": ["url"],
        }
    }

    def summarize_text(self, text):
        """Produce a short extractive summary: the three highest-scoring sentences."""
        # Clean the text: drop citation markers like [12], collapse whitespace,
        # and remove special characters except periods.
        text = re.sub(r'\[[0-9]*\]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9.\s]', '', text)

        # Tokenize into sentences.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if s]

        # Count word frequencies across the whole text.
        word_frequencies = defaultdict(int)
        for sentence in sentences:
            for word in sentence.lower().split():
                word_frequencies[word] += 1

        # Normalize frequencies so each value is the word's share of all words.
        total_words = sum(word_frequencies.values())
        if total_words > 0:
            for word in word_frequencies:
                word_frequencies[word] /= total_words

        # Score each sentence by word frequency, length, and coherence with
        # the preceding sentence.
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = sentence.lower().split()
            score = sum(word_frequencies[word] for word in words)

            # Length factor: prefer sentences of around 15 words.
            sentence_length_factor = 1 - abs(len(words) - 15) / 15
            score += sentence_length_factor * 0.1

            # Coherence: reward vocabulary overlap with the previous sentence.
            if i > 0 and sentences[i - 1] in sentence_scores:
                previous_sentence_words = sentences[i - 1].lower().split()
                common_words = set(words) & set(previous_sentence_words)
                coherence_score = len(common_words) / len(words)
                score += coherence_score * 0.1

            sentence_scores[sentence] = score

        # Take the three highest-scoring sentences.
        ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:3]

        # Join with a space rather than ". ": the cleaned sentences already end
        # with a period, so ". " would produce doubled punctuation.
        summary = " ".join(ranked_sentences)
        if summary and not summary.endswith("."):
            summary += "."
        return summary

    def run(self, **kwargs):
        """Fetch the URL and return a dict with 'status', 'message', and 'output'."""
        # Browser-like headers reduce the chance of being blocked by sites
        # that reject default library user agents.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
        }
        print("Running enhanced web scraper")

        url = kwargs.get("url")
        output_type = kwargs.get("output_type", "summary")
        css_selector = kwargs.get("css_selector")

        if not url:
            return {
                "status": "error",
                "message": "Missing required parameter: 'url'",
                "output": None
            }

        # Import the third-party dependencies lazily so the module can be
        # loaded even before they are installed.
        requests = importlib.import_module("requests")
        bs4 = importlib.import_module("bs4")
        BeautifulSoup = bs4.BeautifulSoup

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            response.encoding = response.apparent_encoding  # Guess the correct encoding

            if output_type == "html":
                # Return the raw HTML content without parsing.
                return {
                    "status": "success",
                    "message": "Fetch completed successfully",
                    "output": response.text,
                }

            # Parse the content using BeautifulSoup.
            soup = BeautifulSoup(response.text, 'html.parser')

            if css_selector:
                # Extract text only from the elements matching the selector.
                elements = soup.select(css_selector)
                text = '\n'.join(element.get_text() for element in elements)
            else:
                # Extract text from the whole parsed document.
                text = soup.get_text()
            # Drop any characters that cannot be encoded as UTF-8.
            text = text.encode('utf-8', 'ignore').decode('utf-8')

            if output_type == "summary":
                output = self.summarize_text(text)
            elif output_type == "full_text":
                output = text
            else:
                return {
                    "status": "error",
                    "message": f"Invalid output_type: {output_type}",
                    "output": None
                }

            return {
                "status": "success",
                "message": "Fetch completed successfully",
                "output": output,
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "message": f"Request failed: {str(e)}",
                "output": None
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e),
                "output": None
            }
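

# A minimal usage sketch, added for illustration and not part of the tool's
# original API: it assumes the `requests` and `beautifulsoup4` packages listed
# in `dependencies` are installed and that https://example.com is reachable.
if __name__ == "__main__":
    tool = GetWebsite()

    # Fetch a page and print a three-sentence extractive summary.
    result = tool.run(url="https://example.com", output_type="summary")
    print(result["status"], "-", result["message"])
    print(result["output"])

    # Narrow extraction to <h1> elements via a CSS selector.
    result = tool.run(url="https://example.com", output_type="full_text", css_selector="h1")
    print(result["output"])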