twimbit-ai committed
Commit 9b25fd9 · verified · 1 Parent(s): 67230f1

Update test_web_rag.py

Files changed (1):
  1. test_web_rag.py +60 -28
test_web_rag.py CHANGED
@@ -15,6 +15,7 @@ from dotenv import load_dotenv
 
 load_dotenv()
 ZENROWS_KEY = os.getenv('ZENROWS_KEY')
+you_key = os.getenv("YOU_API_KEY")
 client = OpenAI()
 
 
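Note: the added you_key = os.getenv("YOU_API_KEY") lookup piggybacks on the same load_dotenv() call already used for ZENROWS_KEY, so the You.com key has to be present in the environment (or the local .env file) before the module runs. A minimal sketch of a guard one could add while testing this locally (hypothetical, not part of the commit):

import os
from dotenv import load_dotenv

load_dotenv()
if not os.getenv("YOU_API_KEY"):
    # Fail fast instead of sending requests with an empty X-API-Key header.
    raise RuntimeError("YOU_API_KEY is missing from the environment/.env file")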
@@ -206,6 +207,31 @@ def check_url_pdf_file(url):
     return False
 
 
+def get_ai_snippets_for_query(query, num):
+    headers = {"X-API-Key": you_key}
+    params = {"query": query}
+    return requests.get(
+        f"https://api.ydc-index.io/search?query={query}&num_web_results={num}",
+        params=params,
+        headers=headers,
+    ).json().get('hits')
+
+
+def get_web_search_you(query, num):
+    docs = get_ai_snippets_for_query(query, num)
+    markdown = ""
+    for doc in docs:
+        for key, value in doc.items():
+            if key == 'snippets':
+                markdown += f"{key}:\n"
+                for snippet in value:
+                    markdown += f"- {snippet}\n"
+            else:
+                markdown += f"{key}: {value}\n"
+        markdown += "\n"
+    return markdown
+
+
 def zenrows_scrapper(url):
     zen_client = ZenRowsClient(ZENROWS_KEY)
     params = {"js_render": "true"}
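The two helpers added in this hunk wrap the You.com search endpoint (api.ydc-index.io): get_ai_snippets_for_query returns the raw 'hits' list and get_web_search_you flattens each hit into a markdown-style digest. A rough sketch of how they might be exercised on their own (hypothetical smoke test; it assumes requests is imported at module level, a valid YOU_API_KEY is configured, and that each hit is a dict containing a 'snippets' list, as the loop implies):

if __name__ == "__main__":
    query = "example search query"  # placeholder query
    hits = get_ai_snippets_for_query(query, 3)
    print(f"hits returned: {len(hits or [])}")
    # The same hits rendered as the text block that get_docs_from_web() will use.
    print(get_web_search_you(query, 3)[:500])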
@@ -231,33 +257,39 @@ def get_new_question_from_history(pre_question, new_question, answer):
 def get_docs_from_web(question, history, n_web_search, strategy):
     if history:
         question = get_new_question_from_history(history[0][0], question, history[0][1])
-    urls = get_google_search_url(get_google_search_query(question))[:n_web_search]
-    urls = list(set(urls))
     docs = ''
-    yield f"Scraping started for {len(urls)} urls:-\n\n"
-    for key, url in enumerate(urls):
-        if '.pdf' in url:
-            yield f"Scraping skipped pdf detected. {key + 1}/{len(urls)} - {url} ❌\n"
-            continue
-
-        if strategy == 'Deep':
-            # page_source = get_page_source_selenium_base(url)
-            page_source = zenrows_scrapper(url)
-            formatted_page_source = format_output(page_source)
-            formatted_page_source = clean_text(formatted_page_source)
-        else:
-            page_source = get_fast_url_source(url)
-            formatted_page_source = format_output(page_source)
-            formatted_page_source = clean_text(formatted_page_source)
-
-        tokens = token_counter(formatted_page_source, 'gpt-3.5-turbo')
-
-        if tokens >= 15585:
-            yield f"Scraping skipped as token limit exceeded. {key + 1}/{len(urls)} - {url} ❌\n"
-            continue
-
-        summary = url_summary(formatted_page_source, question)
-        docs += summary
-        docs += '\n Source:-' + url + '\n\n'
-        yield f"Scraping Done {key + 1}/{len(urls)} - {url} ✅\n"
+    if strategy == 'Deep Pro':
+        docs = get_web_search_you(question, n_web_search)
+    else:
+        urls = get_google_search_url(get_google_search_query(question))[:n_web_search]
+        urls = list(set(urls))
+        yield f"Scraping started for {len(urls)} urls:-\n\n"
+        for key, url in enumerate(urls):
+            if '.pdf' in url or '.PDF' in url:
+                yield f"Scraping skipped pdf detected. {key + 1}/{len(urls)} - {url} ❌\n"
+                continue
+
+            if strategy == 'Deep':
+                # page_source = get_page_source_selenium_base(url)
+                page_source = zenrows_scrapper(url)
+                formatted_page_source = format_output(page_source)
+                formatted_page_source = clean_text(formatted_page_source)
+            else:
+                page_source = get_fast_url_source(url)
+                formatted_page_source = format_output(page_source)
+                if formatted_page_source == '':
+                    yield f"Scraping failed. {key + 1}/{len(urls)} - {url} ❌\n"
+                    continue
+                formatted_page_source = clean_text(formatted_page_source)
+
+            tokens = token_counter(formatted_page_source, 'gpt-3.5-turbo')
+
+            if tokens >= 15585:
+                yield f"Scraping skipped as token limit exceeded. {key + 1}/{len(urls)} - {url} ❌\n"
+                continue
+
+            summary = url_summary(formatted_page_source, question)
+            docs += summary
+            docs += '\n Source:-' + url + '\n\n'
+            yield f"Scraping Done {key + 1}/{len(urls)} - {url} ✅\n"
     yield {"data": docs}
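With this change, get_docs_from_web stays a generator: under the existing scraping strategies it yields per-URL progress strings followed by a final {"data": ...} dict, while the new 'Deep Pro' strategy skips scraping entirely and fills docs from the You.com search. A caller-side sketch (argument values are made up; the function and strategy names come straight from the diff):

# Hypothetical consumer of the generator, not part of the commit.
docs = ""
for update in get_docs_from_web("example question", [], 3, "Deep Pro"):
    if isinstance(update, dict):
        docs = update["data"]  # final aggregated context
    else:
        print(update, end="")  # progress lines (none are emitted for 'Deep Pro')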