twimbit-ai committed on
Commit 9e0ae3c · verified · 1 Parent(s): 48c3d72

Update test_web_rag.py

Files changed (1): test_web_rag.py +36 -21
test_web_rag.py CHANGED
@@ -1,3 +1,4 @@
+import time
 import urllib.request
 from urllib.parse import quote
 from seleniumbase import SB
@@ -12,6 +13,7 @@ from zenrows import ZenRowsClient
 import requests
 import os
 from dotenv import load_dotenv
+from threading import Thread
 
 load_dotenv()
 ZENROWS_KEY = os.getenv('ZENROWS_KEY')
@@ -254,6 +256,25 @@ def get_new_question_from_history(pre_question, new_question, answer):
     return call_open_ai(system_prompt=system_prompt, max_tokens=50)
 
 
+def scraping_job(strategy, question, url, results, key):
+    if strategy == 'Deep':
+        # page_source = get_page_source_selenium_base(url)
+        page_source = zenrows_scrapper(url)
+        formatted_page_source = format_output(page_source)
+        formatted_page_source = clean_text(formatted_page_source)
+    else:
+        page_source = get_fast_url_source(url)
+        formatted_page_source = format_output(page_source)
+        formatted_page_source = clean_text(formatted_page_source)
+
+    tokens = token_counter(formatted_page_source, 'gpt-3.5-turbo')
+    if tokens >= 15585:
+        results[key] = ''
+    else:
+        summary = url_summary(formatted_page_source, question)
+        results[key] = summary
+
+
 def get_docs_from_web(question, history, n_web_search, strategy):
     if history:
         question = get_new_question_from_history(history[0][0], question, history[0][1])
@@ -264,32 +285,26 @@ def get_docs_from_web(question, history, n_web_search, strategy):
     urls = get_google_search_url(get_google_search_query(question))[:n_web_search]
     urls = list(set(urls))
     yield f"Scraping started for {len(urls)} urls:-\n\n"
+
+    threads = [None] * len(urls)
+    results = [None] * len(urls)
+
     for key, url in enumerate(urls):
         if '.pdf' in url or '.PDF' in url:
             yield f"Scraping skipped pdf detected. {key + 1}/{len(urls)} - {url} ❌\n"
+            results[key] = ''
             continue
 
-        if strategy == 'Deep':
-            # page_source = get_page_source_selenium_base(url)
-            page_source = zenrows_scrapper(url)
-            formatted_page_source = format_output(page_source)
-            formatted_page_source = clean_text(formatted_page_source)
-        else:
-            page_source = get_fast_url_source(url)
-            formatted_page_source = format_output(page_source)
-        if formatted_page_source == '':
-            yield f"Scraping failed. {key + 1}/{len(urls)} - {url} ❌\n"
-            continue
-        formatted_page_source = clean_text(formatted_page_source)
+        threads[key] = Thread(target=scraping_job, args=(strategy, question, url, results, key))
+        threads[key].start()
 
-        tokens = token_counter(formatted_page_source, 'gpt-3.5-turbo')
-
-        if tokens >= 15585:
-            yield f"Scraping skipped as token limit exceeded. {key + 1}/{len(urls)} - {url} ❌\n"
-            continue
+    for i in range(len(threads)):
+        if threads[i] is not None:
+            threads[i].join()
 
-        summary = url_summary(formatted_page_source, question)
-        docs += summary
-        docs += '\n Source:-' + url + '\n\n'
-        yield f"Scraping Done {key + 1}/{len(urls)} - {url} ✅\n"
+    for key, result in enumerate(results):
+        if result is not None and result != '':
+            docs += result
+            docs += '\n Source:-' + urls[key] + '\n\n'
+            yield f"Scraping Done {key + 1}/{len(urls)} - {urls[key]} ✅\n"
     yield {"data": docs}