Update app.py
app.py CHANGED
@@ -25,6 +25,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from trafilatura import fetch_url, extract
 import json
+from requests.exceptions import RequestException
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 # Set up basic configuration for logging
@@ -280,15 +283,20 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
 
 class SimpleDDGSearch:
     def search(self, query: str, num_results: int = 5):
-        results =
-
+        results = []
+        with DDGS() as ddgs:
+            for r in ddgs.text(query, region='wt-wt', safesearch='off', max_results=num_results * 2):  # Request more results than needed
+                results.append(r["href"])
+                if len(results) >= num_results:
+                    break
+        return results
 
 class TrafilaturaWebCrawler:
     def get_website_content_from_url(self, url: str) -> str:
         try:
             downloaded = fetch_url(url)
             if downloaded is None:
-
+                raise RequestException(f"Failed to fetch content from URL: {url}")
 
             result = extract(downloaded, output_format='json', include_comments=False, with_metadata=True, url=url)
             if result:
@@ -301,21 +309,33 @@ class TrafilaturaWebCrawler:
 
                 return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{content}\n\n=========== Website Content End ===========\n\n'
             else:
-
+                raise ValueError(f"No content extracted from URL: {url}")
         except Exception as e:
-
+            logging.error(f"An error occurred while processing {url}: {str(e)}")
+            return None
 
 def search_and_crawl(query: str, num_results: int = 10):
     searcher = SimpleDDGSearch()
-    search_results = searcher.search(query, num_results=num_results)
+    search_results = searcher.search(query, num_results=num_results * 2)  # Request more results than needed
 
     crawler = TrafilaturaWebCrawler()
     output = ""
+    successful_crawls = 0
 
-    for
-
-
-
+    for url in search_results:
+        if successful_crawls >= num_results:
+            break
+
+        content = crawler.get_website_content_from_url(url)
+        if content:
+            output += f"Results for URL {successful_crawls + 1}: {url}\n\n"
+            output += content + "\n"
+            output += "------------------------------------------------------------\n\n"
+            successful_crawls += 1
+
+    if successful_crawls == 0:
+        logging.warning(f"No successful crawls for query: {query}")
+        return "No results could be fetched for the given query."
 
     return output
 
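For context, below is a minimal, self-contained sketch of the pattern this commit introduces: over-fetch DuckDuckGo hits, skip URLs whose download or extraction fails, and stop once the requested number of successful crawls is reached. The tolerant_crawl helper name and the duckduckgo_search import are illustrative assumptions, not part of app.py; the visible diff only shows DDGS being used, not where it is imported from.

# Sketch only: approximates the updated search-and-crawl flow, assuming the
# duckduckgo-search and trafilatura packages are installed.
import logging
from duckduckgo_search import DDGS          # assumption: DDGS comes from this package
from trafilatura import fetch_url, extract

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def tolerant_crawl(query: str, num_results: int = 3) -> str:
    # Request twice as many hits as needed so failed URLs can be skipped.
    with DDGS() as ddgs:
        urls = [r["href"] for r in ddgs.text(query, region='wt-wt', safesearch='off',
                                             max_results=num_results * 2)]
    output, successful = "", 0
    for url in urls:
        if successful >= num_results:
            break
        downloaded = fetch_url(url)
        content = extract(downloaded) if downloaded else None
        if content:
            output += f"Results for URL {successful + 1}: {url}\n\n{content}\n\n"
            successful += 1
        else:
            logging.error(f"Skipping {url}: fetch or extraction failed")
    return output or "No results could be fetched for the given query."

if __name__ == "__main__":
    print(tolerant_crawl("trafilatura web scraping", num_results=2))

The design trade-off is straightforward: asking the searcher for num_results * 2 hits costs slightly more per search call, but dead links or pages trafilatura cannot parse no longer shrink the final report below the requested number of results.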