import importlib
import re
from collections import defaultdict

__all__ = ['GetWebsite']


class GetWebsite:
    dependencies = ["requests", "beautifulsoup4==4.13.3"]

    inputSchema = {
        "name": "GetWebsite",
        "description": "Returns the content of a website with enhanced error handling and output options.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The URL of the website to fetch content from.",
                },
                "output_type": {
                    "type": "string",
                    "enum": ["summary", "full_text", "html"],
                    "description": "The type of output to return. 'summary' returns a summary of the text, 'full_text' returns the full text content, and 'html' returns the raw HTML content.",
                    "default": "summary"
                },
                "css_selector": {
                    "type": "string",
                    "description": "A CSS selector to extract specific content from the page.",
                }
            },
            "required": ["url"],
        }
    }
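
    # NOTE: inputSchema appears to follow the JSON-Schema-style function-calling
    # format used by common tool-hosting frameworks; the exact fields a given
    # host expects are an assumption here, not verified against a spec.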

    def summarize_text(self, text):
        # Clean the text: drop citation markers like [12], collapse whitespace,
        # and strip special characters while keeping sentence-ending punctuation
        # so the sentence splitter below still has delimiters to work with.
        text = re.sub(r'\[[0-9]*\]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9.!?\s]', '', text)

        # Tokenize into sentences.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        sentences = [s.strip() for s in sentences if s.strip()]

        # Calculate word frequencies.
        word_frequencies = defaultdict(int)
        for sentence in sentences:
            for word in sentence.lower().split():
                word_frequencies[word] += 1

        # Normalize word frequencies.
        total_words = sum(word_frequencies.values())
        if total_words > 0:
            for word in word_frequencies:
                word_frequencies[word] /= total_words

        # Score each sentence by word frequency, sentence length, and coherence
        # with the preceding sentence.
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = sentence.lower().split()
            score = sum(word_frequencies[word] for word in words)

            # Prefer sentences around 15 words long.
            sentence_length_factor = 1 - abs(len(words) - 15) / 15
            score += sentence_length_factor * 0.1

            # Reward word overlap with the previous sentence (guarding against
            # empty word lists to avoid division by zero).
            if i > 0 and words:
                previous_sentence_words = sentences[i - 1].lower().split()
                common_words = set(words) & set(previous_sentence_words)
                score += (len(common_words) / len(words)) * 0.1

            sentence_scores[sentence] = score

        # Join the three highest-scoring sentences; each already ends with its
        # own punctuation, so join with a space rather than appending periods.
        ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:3]
        return " ".join(ranked_sentences)
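
    # A quick illustration of the summarizer (hypothetical input; the actual
    # output depends on the word-frequency statistics of the real text):
    #
    #   GetWebsite().summarize_text(
    #       "Python is popular. Python is readable. Readability matters. "
    #       "Cats sleep a lot."
    #   )
    #   # -> the three sentences sharing the most frequent words, joined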

    def run(self, **kwargs):
        # Browser-like headers reduce the chance of being blocked by sites
        # that reject obvious bot traffic.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Sec-GPC': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Priority': 'u=0, i',
        }
        print("Running enhanced web scraper")
        url = kwargs.get("url")
        output_type = kwargs.get("output_type", "summary")
        css_selector = kwargs.get("css_selector")
        if not url:
            return {
                "status": "error",
                "message": "Missing required parameter: 'url'",
                "output": None
            }

        # Import dependencies lazily so they are only required at run time.
        requests = importlib.import_module("requests")
        bs4 = importlib.import_module("bs4")
        BeautifulSoup = bs4.BeautifulSoup

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            response.encoding = response.apparent_encoding  # Handle encoding

            if output_type == "html":
                # Return the raw HTML content without parsing.
                return {
                    "status": "success",
                    "message": "Website fetched successfully",
                    "output": response.text,
                }

            # Parse the content using BeautifulSoup.
            soup = BeautifulSoup(response.text, 'html.parser')
            if css_selector:
                # Extract text only from the elements matching the selector.
                elements = soup.select(css_selector)
                text = '\n'.join(element.get_text() for element in elements)
            else:
                # Extract text from the whole parsed document.
                text = soup.get_text()
            text = text.encode('utf-8', 'ignore').decode('utf-8')

            if output_type == "summary":
                output = self.summarize_text(text)
            elif output_type == "full_text":
                output = text
            else:
                return {
                    "status": "error",
                    "message": f"Invalid output_type: {output_type}",
                    "output": None
                }

            return {
                "status": "success",
                "message": "Website fetched successfully",
                "output": output,
            }
        except requests.exceptions.RequestException as e:
            return {
                "status": "error",
                "message": f"Request failed: {str(e)}",
                "output": None
            }
        except Exception as e:
            return {
                "status": "error",
                "message": str(e),
                "output": None
            }
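

# A minimal usage sketch, assuming the class is called directly; the real
# invocation protocol of whatever tool-hosting framework loads this file is
# an assumption, not shown here.
if __name__ == "__main__":
    tool = GetWebsite()
    result = tool.run(url="https://example.com", output_type="summary")
    print(result["status"])
    print(result["output"])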