gperdrizet committed (verified)
Commit cade3d1 · 1 Parent(s): 0000c5e

Completed vector context search tool.

Files changed (3):
  1. functions/rag.py +20 -10
  2. functions/tools.py +5 -3
  3. rss_server.py +15 -3
functions/rag.py CHANGED

@@ -5,14 +5,14 @@ import logging
 import queue
 from semantic_text_splitter import TextSplitter
 from tokenizers import Tokenizer
-from upstash_vector import Index, Vector
+from upstash_vector import Index
 
 
 def ingest(rag_ingest_queue: queue.Queue) -> None:
     '''Semantically chunks article and upsert to Upstash vector db
     using article title as namespace.'''
 
-    logger = logging.getLevelName(__name__ + '.ingest()')
+    logger = logging.getLogger(__name__ + '.ingest()')
 
     index = Index(
         url='https://living-whale-89944-us1-vector.upstash.io',
@@ -24,27 +24,37 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()
 
     item = rag_ingest_queue.get()
+    logger.info(item)
     title = item['title']
-    text = item['content']
-    logger.info('Got %s from RAG ingest queue', title)
 
     if title not in namespaces:
+        text = item['content']
+        logger.info('Got "%s" from RAG ingest queue', title)
 
         tokenizer=Tokenizer.from_pretrained('bert-base-uncased')
         splitter=TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
         chunks=splitter.chunks(text)
 
         for i, chunk in enumerate(chunks):
+            # index.upsert(
+            #     vectors=[
+            #         Vector(
+            #             id=hash(f'{title}-{i}'),
+            #             data=chunk,
+            #         )
+            #     ],
+            #     namespace=title
+            # )
+
             index.upsert(
-                vectors=[
-                    Vector(
-                        id=hash(f'{title}-{i}'),
-                        data=chunk,
+                [
+                    (
+                        hash(f'{title}-{i}'),
+                        chunk,
+                        {'namespace': title}
                    )
                 ],
-                namespace=title
             )
-
        logger.info('Ingested %s chunks into vector DB', i + 1)
 
     else:
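
Note: the rewritten upsert passes plain (id, data, metadata) tuples and carries the title in the metadata dict rather than in upsert's namespace= argument. A minimal standalone sketch of the same chunk-and-upsert flow, assuming an Upstash index created with a built-in embedding model so raw text can be sent as data; the environment variable names and article values here are illustrative, not from this repo:

import os
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from upstash_vector import Index

# Hypothetical credentials; real values would come from the deployment.
index = Index(
    url=os.environ['UPSTASH_VECTOR_REST_URL'],
    token=os.environ['UPSTASH_VECTOR_REST_TOKEN']
)

title = 'Example article'
text = 'Body text to be chunked, embedded and stored...'

# Split on semantic boundaries into chunks of at most 256 BERT tokens,
# matching the splitter configuration in ingest().
tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
chunks = splitter.chunks(text)

for i, chunk in enumerate(chunks):
    # Tuple form: (id, data, metadata). Python's built-in hash() is salted
    # per process, so these ids are not stable across restarts.
    index.upsert(
        [
            (hash(f'{title}-{i}'), chunk, {'namespace': title})
        ]
    )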
functions/tools.py CHANGED

@@ -59,7 +59,7 @@ def get_feed(website: str) -> list:
     if item['content'] is not None:
 
         RAG_INGEST_QUEUE.put(item)
-        logger.info('%s sent to RAG ingest', item['title'])
+        logger.info('"%s" sent to RAG ingest', item['title'])
 
         summary = summarization_funcs.summarize_content(
             item['title'],
@@ -67,7 +67,7 @@ def get_feed(website: str) -> list:
         )
 
         content[i]['summary'] = summary
-        logger.info('Summary of %s generated', item['title'])
+        logger.info('Summary of "%s" generated', item['title'])
 
         content[i].pop('content', None)
 
@@ -96,8 +96,10 @@ def context_search(query: str, article_title: str = None) -> str:
     results = None
 
     results = index.query(
-        [query],
+        data=query,
         top_k=3,
+        include_metadata=True,
+        include_data=True,
         namespace=article_title
     )
 
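Note: a sketch of how the updated query might be read on the caller's side. Index.from_env() and the result attribute names (data, metadata) are assumptions about the upstash-vector client, not code from this repo:

from upstash_vector import Index

def search_example(query: str, article_title: str = None) -> str:
    '''Sketch only: queries with raw text and joins the matched chunks
    into a single context string.'''

    index = Index.from_env()  # assumed helper reading URL/token from env vars

    results = index.query(
        data=query,             # raw text, embedded server-side
        top_k=3,
        include_metadata=True,  # return each hit's metadata dict
        include_data=True,      # return each hit's original chunk text
        namespace=article_title
    )

    return '\n\n'.join(result.data for result in results if result.data)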
rss_server.py CHANGED

@@ -40,7 +40,7 @@ with gr.Blocks() as demo:
     gr.HTML(html.DESCRIPTION)
 
     # Log output
-    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=100)
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=10)
     timer = gr.Timer(0.5, active=True)
 
     timer.tick( # pylint: disable=no-member
@@ -51,16 +51,28 @@
 
     # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
-    output = gr.Textbox(label='RSS entries', lines=10)
+    feed_output = gr.Textbox(label='RSS entries', lines=10, max_lines=10)
     submit_button = gr.Button('Submit')
 
     submit_button.click( # pylint: disable=no-member
         fn=tool_funcs.get_feed,
         inputs=website_url,
-        outputs=output,
+        outputs=feed_output,
         api_name='Get RSS feed content'
     )
 
+    # Vector search tool
+    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
+    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
+    submit_button = gr.Button('Submit')
+
+    submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.context_search,
+        inputs=search_query,
+        outputs=search_output,
+        api_name='Context vector search'
+    )
+
 
 if __name__ == '__main__':
 
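Note: the new block rebinds submit_button, which works because the first button's click handler is registered before the name is reassigned, though a distinct name would read more clearly. A standalone sketch of the vector-search wiring, with a stub standing in for functions.tools.context_search:

import gradio as gr

def context_search(query: str) -> str:
    '''Stub standing in for the real tool function.'''
    return f'(vector search results for: {query})'

with gr.Blocks() as demo:
    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
    search_button = gr.Button('Submit')  # distinct name avoids rebinding

    search_button.click(
        fn=context_search,
        inputs=search_query,
        outputs=search_output,
        api_name='Context vector search'
    )

if __name__ == '__main__':
    demo.launch()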