import importlib
from collections import defaultdict
import re

__all__ = ['GetWebsite']


class GetWebsite:
    dependencies = ["requests", "beautifulsoup4==4.13.3"]

    inputSchema = {
        "name": "GetWebsite",
        "description": "Returns the content of a website with enhanced error handling and output options.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL of the website to fetch content from.",
                },
                "output_type": {
                    "type": "string",
                    "enum": ["summary", "full_text", "html"],
                    "description": "The type of output to return. 'summary' returns a summary of the text, 'full_text' returns the full text content, and 'html' returns the raw HTML content.",
                    "default": "summary"
                },
                "css_selector": {
                    "type": "string",
                    "description": "A CSS selector to extract specific content from the page.",
                }
            },
            "required": ["url"],
        }
    }
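
    # A call matching the schema above might look like this (values are
    # hypothetical):
    #     run(url="https://example.com", output_type="full_text",
    #         css_selector="article p")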

    def summarize_text(self, text):
        """Return a crude extractive summary: the three highest-scoring sentences."""
        # Clean the text: drop citation markers like [12] and collapse whitespace
        text = re.sub(r'\[[0-9]*\]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters, but keep . ! ? so the sentence split
        # below can still find its boundaries
        text = re.sub(r'[^a-zA-Z0-9.!?\s]', '', text)

        # Tokenize into sentences; the terminal punctuation stays attached
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        if not sentences:
            return ""

        # Calculate word frequencies
        word_frequencies = defaultdict(int)
        for sentence in sentences:
            words = sentence.lower().split()
            for word in words:
                word_frequencies[word] += 1

        # Normalize word frequencies
        total_words = sum(word_frequencies.values())
        if total_words > 0:
            for word in word_frequencies:
                word_frequencies[word] /= total_words

        # Calculate sentence scores based on word frequencies, sentence length, and coherence
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            score = 0
            words = sentence.lower().split()
            for word in words:
                score += word_frequencies[word]

            # Prefer sentences around 15 words; clamp at zero so very long
            # sentences are not pushed to a negative score
            sentence_length_factor = max(0.0, 1 - abs(len(words) - 15) / 15)
            score += sentence_length_factor * 0.1

            # Add a coherence score: reward word overlap with the previous
            # sentence (which has always been scored by this point)
            if i > 0:
                previous_sentence_words = set(sentences[i - 1].lower().split())
                common_words = set(words) & previous_sentence_words
                coherence_score = len(common_words) / len(words)
                score += coherence_score * 0.1

            sentence_scores[sentence] = score

        # Keep the top 3 sentences with the highest scores
        ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:3]

        # Join into a summary; each sentence already carries its own terminal
        # punctuation, so joining with ". " would double it
        summary = " ".join(ranked_sentences)
        if summary and not summary.endswith(('.', '!', '?')):
            summary += "."
        return summary
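
    # Illustrative behavior of summarize_text (hypothetical input; output order
    # follows sentence scores, not document order):
    #     GetWebsite().summarize_text(
    #         "Cats sleep a lot. Cats hunt mice. Dogs bark. Cats purr softly.")
    #     -> the three highest-scoring sentences joined into one string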

    def run(self, **kwargs):
        """Fetch a website and return its raw HTML, full text, or a summary."""
        # Browser-like headers reduce the chance of trivial bot blocking
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
        }
        print("Running enhanced web scraper")

        url = kwargs.get("url")
        output_type = kwargs.get("output_type", "summary")
        css_selector = kwargs.get("css_selector")

        if not url:
            return {
                "status": "error",
                "message": "Missing required parameters: 'url'",
                "output": None
            }

        output = None
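        # Import dependencies lazily so the tool's schema can be inspected
        # without them installed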
        requests = importlib.import_module("requests")
        bs4 = importlib.import_module("bs4")
        BeautifulSoup = bs4.BeautifulSoup
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            response.encoding = response.apparent_encoding  # Handle encoding
            if output_type == "html":
                # Return the raw HTML content
                return {
                    "status": "success",
                    "message": "Search completed successfully",
                    "output": response.text,
                }

            # Parse the content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop script and style tags so the extracted text contains only
            # visible page content
            for tag in soup(['script', 'style']):
                tag.decompose()

            if css_selector:
                # Extract text from the selected elements
                elements = soup.select(css_selector)
                text = '\n'.join(element.get_text() for element in elements)
            else:
                # Extract text from the whole parsed document
                text = soup.get_text(separator=' ', strip=True)
            text = text.encode('utf-8', 'ignore').decode('utf-8')

            if output_type == "summary":
                # Summarize the text
                output = self.summarize_text(text)
            elif output_type == "full_text":
                output = text
            else:
                return {
                    "status": "error",
                    "message": f"Invalid output_type: {output_type}",
                    "output": None
                }

            return {
                "status": "success",
                "message": "Search completed successfully",
                "output": output,
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "message": f"Request failed: {str(e)}",
                "output": None
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e),
                "output": None
            }
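
# A minimal usage sketch (assumes 'requests' and 'beautifulsoup4' are
# installed; the URL is a placeholder):
if __name__ == "__main__":
    tool = GetWebsite()
    result = tool.run(url="https://example.com", output_type="summary")
    print(result["status"], "-", result["message"])
    print(result["output"])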