Shreyas094 committed
Commit 7678968 · verified · 1 Parent(s): 86ff084

Update app.py

Files changed (1):
  1. app.py +30 -10
app.py CHANGED
@@ -25,6 +25,9 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from trafilatura import fetch_url, extract
 import json
+from requests.exceptions import RequestException
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
 # Set up basic configuration for logging
@@ -280,15 +283,20 @@ def generate_chunked_response(prompt, model, max_tokens=10000, num_calls=3, temp
 
 class SimpleDDGSearch:
     def search(self, query: str, num_results: int = 5):
-        results = DDGS().text(query, region='wt-wt', safesearch='off', max_results=num_results)
-        return [res["href"] for res in results]
+        results = []
+        with DDGS() as ddgs:
+            for r in ddgs.text(query, region='wt-wt', safesearch='off', max_results=num_results * 2):  # Request more results than needed
+                results.append(r["href"])
+                if len(results) >= num_results:
+                    break
+        return results
 
 class TrafilaturaWebCrawler:
     def get_website_content_from_url(self, url: str) -> str:
         try:
             downloaded = fetch_url(url)
             if downloaded is None:
-                return f"Failed to fetch content from URL: {url}"
+                raise RequestException(f"Failed to fetch content from URL: {url}")
 
             result = extract(downloaded, output_format='json', include_comments=False, with_metadata=True, url=url)
             if result:
@@ -301,21 +309,33 @@ class TrafilaturaWebCrawler:
 
                 return f'=========== Website Title: {title} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{content}\n\n=========== Website Content End ===========\n\n'
             else:
-                return f"No content extracted from URL: {url}"
+                raise ValueError(f"No content extracted from URL: {url}")
         except Exception as e:
-            return f"An error occurred while processing {url}: {str(e)}"
+            logging.error(f"An error occurred while processing {url}: {str(e)}")
+            return None
 
 def search_and_crawl(query: str, num_results: int = 10):
     searcher = SimpleDDGSearch()
-    search_results = searcher.search(query, num_results=num_results)
+    search_results = searcher.search(query, num_results=num_results * 2)  # Request more results than needed
 
     crawler = TrafilaturaWebCrawler()
     output = ""
+    successful_crawls = 0
 
-    for i, url in enumerate(search_results):
-        output += f"Results for URL {i+1}: {url}\n\n"
-        output += crawler.get_website_content_from_url(url) + "\n"
-        output += "------------------------------------------------------------\n\n"
+    for url in search_results:
+        if successful_crawls >= num_results:
+            break
+
+        content = crawler.get_website_content_from_url(url)
+        if content:
+            output += f"Results for URL {successful_crawls + 1}: {url}\n\n"
+            output += content + "\n"
+            output += "------------------------------------------------------------\n\n"
+            successful_crawls += 1
+
+    if successful_crawls == 0:
+        logging.warning(f"No successful crawls for query: {query}")
+        return "No results could be fetched for the given query."
 
     return output
 
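For anyone trying the change locally, a minimal usage sketch follows. It assumes app.py's existing imports are in place (DDGS from duckduckgo_search, trafilatura, logging); the query string and result count are illustrative, not part of the commit.

# Usage sketch, not part of the commit: the query and num_results are made up.
if __name__ == "__main__":
    report = search_and_crawl("python web scraping tutorial", num_results=3)
    # With this change, failed fetches are logged and skipped rather than
    # embedded in the output; if every crawl fails, the fallback string is returned.
    print(report)

One design note: get_website_content_from_url now returns None on failure instead of an error string, so any caller other than search_and_crawl must check for None before concatenating. Also, since search already doubles max_results internally and search_and_crawl doubles num_results again, DDGS is effectively asked for up to four times the requested number of results.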