import importlib
import re
from collections import defaultdict

__all__ = ['GetWebsite']


class GetWebsite:
    dependencies = ["requests", "beautifulsoup4==4.13.3"]

    inputSchema = {
        "name": "GetWebsite",
        "description": "Returns the content of a website with enhanced error handling and output options.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL of the website to fetch content from.",
                },
                "output_type": {
                    "type": "string",
                    "enum": ["summary", "full_text", "html"],
                    "description": "The type of output to return. 'summary' returns a summary of the text, 'full_text' returns the full text content, and 'html' returns the raw HTML content.",
                    "default": "summary"
                },
                "css_selector": {
                    "type": "string",
                    "description": "A CSS selector to extract specific content from the page.",
                }
            },
            "required": ["url"],
        }
    }
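
    # NOTE: inputSchema appears to follow the JSON-Schema-style function-calling
    # format used by common tool-hosting frameworks; the exact fields a given
    # host expects are an assumption here, not verified against a spec.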

    def summarize_text(self, text):
        # Clean the text: drop citation markers like [12], collapse whitespace,
        # and strip special characters while keeping sentence-ending punctuation
        # so the sentence splitter below still has delimiters to work with.
        text = re.sub(r'\[[0-9]*\]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9.!?\s]', '', text)

        # Tokenize into sentences.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Calculate word frequencies.
        word_frequencies = defaultdict(int)
        for sentence in sentences:
            for word in sentence.lower().split():
                word_frequencies[word] += 1

        # Normalize word frequencies.
        total_words = sum(word_frequencies.values())
        if total_words > 0:
            for word in word_frequencies:
                word_frequencies[word] /= total_words

        # Score each sentence by word frequency, sentence length, and coherence
        # with the preceding sentence.
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = sentence.lower().split()
            score = sum(word_frequencies[word] for word in words)

            # Prefer sentences around 15 words long.
            sentence_length_factor = 1 - abs(len(words) - 15) / 15
            score += sentence_length_factor * 0.1

            # Reward word overlap with the previous sentence (guarding against
            # empty word lists to avoid division by zero).
            if i > 0 and words:
                previous_sentence_words = sentences[i - 1].lower().split()
                common_words = set(words) & set(previous_sentence_words)
                score += (len(common_words) / len(words)) * 0.1

            sentence_scores[sentence] = score

        # Join the three highest-scoring sentences; each already ends with its
        # own punctuation, so join with a space rather than appending periods.
        ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:3]
        return " ".join(ranked_sentences)
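
    # A quick illustration of the summarizer (hypothetical input; the actual
    # output depends on the word-frequency statistics of the real text):
    #
    #   GetWebsite().summarize_text(
    #       "Python is popular. Python is readable. Readability matters. "
    #       "Cats sleep a lot."
    #   )
    #   # -> the three sentences sharing the most frequent words, joined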

    def run(self, **kwargs):
        # Browser-like headers reduce the chance of being blocked by sites
        # that reject obvious bot traffic.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
        }
        print("Running enhanced web scraper")
        url = kwargs.get("url")
        output_type = kwargs.get("output_type", "summary")
        css_selector = kwargs.get("css_selector")
        if not url:
            return {
                "status": "error",
                "message": "Missing required parameter: 'url'",
                "output": None
            }

        # Import dependencies lazily so they are only required at run time.
        requests = importlib.import_module("requests")
        bs4 = importlib.import_module("bs4")
        BeautifulSoup = bs4.BeautifulSoup

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            response.encoding = response.apparent_encoding  # Handle encoding

            if output_type == "html":
                # Return the raw HTML content without parsing.
                return {
                    "status": "success",
                    "message": "Website fetched successfully",
                    "output": response.text,
                }

            # Parse the content using BeautifulSoup.
            soup = BeautifulSoup(response.text, 'html.parser')
            if css_selector:
                # Extract text only from the elements matching the selector.
                elements = soup.select(css_selector)
                text = '\n'.join(element.get_text() for element in elements)
            else:
                # Extract text from the whole parsed document.
                text = soup.get_text()
            text = text.encode('utf-8', 'ignore').decode('utf-8')

            if output_type == "summary":
                output = self.summarize_text(text)
            elif output_type == "full_text":
                output = text
            else:
                return {
                    "status": "error",
                    "message": f"Invalid output_type: {output_type}",
                    "output": None
                }

            return {
                "status": "success",
                "message": "Website fetched successfully",
                "output": output,
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "message": f"Request failed: {str(e)}",
                "output": None
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e),
                "output": None
            }
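

# A minimal usage sketch, assuming the class is called directly; the real
# invocation protocol of whatever tool-hosting framework loads this file is
# an assumption, not shown here.
if __name__ == "__main__":
    tool = GetWebsite()
    result = tool.run(url="https://example.com", output_type="summary")
    print(result["status"])
    print(result["output"])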