Completed vector context search tool.
- functions/rag.py +20 -10
- functions/tools.py +5 -3
- rss_server.py +15 -3
functions/rag.py
CHANGED
@@ -5,14 +5,14 @@ import logging
 import queue
 from semantic_text_splitter import TextSplitter
 from tokenizers import Tokenizer
-from upstash_vector import Index, Vector
+from upstash_vector import Index
 
 
 def ingest(rag_ingest_queue: queue.Queue) -> None:
     '''Semantically chunks article and upsert to Upstash vector db
     using article title as namespace.'''
 
-    logger = logging.
+    logger = logging.getLogger(__name__ + '.ingest()')
 
     index = Index(
         url='https://living-whale-89944-us1-vector.upstash.io',
@@ -24,27 +24,37 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()
 
     item = rag_ingest_queue.get()
+    logger.info(item)
     title = item['title']
-    text = item['content']
-    logger.info('Got %s from RAG ingest queue', title)
 
     if title not in namespaces:
+        text = item['content']
+        logger.info('Got "%s" from RAG ingest queue', title)
 
         tokenizer=Tokenizer.from_pretrained('bert-base-uncased')
         splitter=TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
        chunks=splitter.chunks(text)
 
         for i, chunk in enumerate(chunks):
+            # index.upsert(
+            #     vectors=[
+            #         Vector(
+            #             id=hash(f'{title}-{i}'),
+            #             data=chunk,
+            #         )
+            #     ],
+            #     namespace=title
+            # )
+
             index.upsert(
-                vectors=[
-                    Vector(
-                        id=hash(f'{title}-{i}'),
-                        data=chunk,
+                [
+                    (
+                        hash(f'{title}-{i}'),
+                        chunk,
+                        {'namespace': title}
                     )
                 ],
-                namespace=title
             )
-
             logger.info('Ingested %s chunks into vector DB', i + 1)
 
     else:
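For context: the new upsert path sends each raw text chunk as an (id, data, metadata) tuple and relies on the Upstash index's server-side embedding model to vectorize it. Below is a minimal, self-contained sketch of the same chunk-and-upsert flow, not the repo's exact code: the index URL, token, title, and text are placeholders, and it assumes (as the diff does) that the index was created with a built-in embedding model so a string is accepted in place of a vector.

import logging
from semantic_text_splitter import TextSplitter
from tokenizers import Tokenizer
from upstash_vector import Index

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Placeholder credentials: substitute your own Upstash index URL and token
index = Index(url='https://example-vector.upstash.io', token='UPSTASH_TOKEN')

title = 'Example article'                      # placeholder article title
text = 'Long article body to be chunked ...'   # placeholder article body

# Split on semantic boundaries, capping each chunk at 256 BERT tokens
tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
chunks = splitter.chunks(text)

for i, chunk in enumerate(chunks):
    # (id, data, metadata) tuple, mirroring the diff; the namespace is
    # carried in metadata rather than as an upsert kwarg
    index.upsert([(hash(f'{title}-{i}'), chunk, {'namespace': title})])

logger.info('Ingested %s chunks into vector DB', len(chunks))

One caveat worth noting: hash() on strings is salted per interpreter run, so re-ingesting the same article in a new process produces fresh ids rather than overwriting the earlier vectors; a stable digest (e.g. hashlib) would make the ids reproducible.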
functions/tools.py
CHANGED
@@ -59,7 +59,7 @@ def get_feed(website: str) -> list:
         if item['content'] is not None:
 
             RAG_INGEST_QUEUE.put(item)
-            logger.info('%s sent to RAG ingest', item['title'])
+            logger.info('"%s" sent to RAG ingest', item['title'])
 
             summary = summarization_funcs.summarize_content(
                 item['title'],
@@ -67,7 +67,7 @@ def get_feed(website: str) -> list:
             )
 
             content[i]['summary'] = summary
-            logger.info('Summary of %s generated', item['title'])
+            logger.info('Summary of "%s" generated', item['title'])
 
         content[i].pop('content', None)
 
@@ -96,8 +96,10 @@ def context_search(query: str, article_title: str = None) -> str:
     results = None
 
     results = index.query(
-
+        data=query,
         top_k=3,
+        include_metadata=True,
+        include_data=True,
         namespace=article_title
     )
 
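The query side now passes the raw query string via data= so the same server-side embedding model encodes it, and include_data=True returns the stored chunk text alongside each hit. A rough sketch of how context_search might assemble those hits into a context string follows; the URL and token are placeholders, and it assumes the upstash_vector query results expose .data as attributes, per the SDK's result objects.

from upstash_vector import Index

# Placeholder credentials: substitute your own Upstash index URL and token
index = Index(url='https://example-vector.upstash.io', token='UPSTASH_TOKEN')

def context_search(query: str, article_title: str = None) -> str:
    '''Returns the top matching chunks for a query, optionally scoped
    to the namespace of a single article.'''

    results = index.query(
        data=query,              # raw text, embedded server-side
        top_k=3,
        include_metadata=True,
        include_data=True,
        namespace=article_title
    )

    # Concatenate the stored chunk text of each hit into one context block
    return '\n\n'.join(hit.data for hit in results if hit.data)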
rss_server.py
CHANGED
@@ -40,7 +40,7 @@ with gr.Blocks() as demo:
     gr.HTML(html.DESCRIPTION)
 
     # Log output
-    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=10)
     timer = gr.Timer(0.5, active=True)
 
     timer.tick( # pylint: disable=no-member
@@ -51,16 +51,28 @@
 
     # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
-
+    feed_output = gr.Textbox(label='RSS entries', lines=10, max_lines=10)
     submit_button = gr.Button('Submit')
 
     submit_button.click( # pylint: disable=no-member
         fn=tool_funcs.get_feed,
         inputs=website_url,
-        outputs=
+        outputs=feed_output,
         api_name='Get RSS feed content'
     )
 
+    # Vector search tool
+    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
+    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
+    submit_button = gr.Button('Submit')
+
+    submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.context_search,
+        inputs=search_query,
+        outputs=search_output,
+        api_name='Context vector search'
+    )
+
 
 if __name__ == '__main__':
 
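Pulled out of the diff, the new vector-search block is just a second query-in/results-out pair wired to context_search. A self-contained sketch is below; the tool_funcs import path is assumed from the repo layout, and the button is given its own name.

import gradio as gr

import functions.tools as tool_funcs  # assumed import path

with gr.Blocks() as demo:

    # Vector search tool
    search_query = gr.Textbox('Does apple offer parental controls?', label='Vector search query')
    search_output = gr.Textbox(label='Vector search results', lines=10, max_lines=10)
    search_button = gr.Button('Submit')

    # Route the query text through the context search tool and display the result
    search_button.click(
        fn=tool_funcs.context_search,
        inputs=search_query,
        outputs=search_output,
        api_name='Context vector search'
    )

if __name__ == '__main__':
    demo.launch()

In the diff itself the second button reuses the submit_button name. Gradio registers components with the Blocks context at construction time, so both buttons still render and both .click handlers fire; rebinding the variable only means the feed button is no longer addressable afterwards, which is why distinct names, as above, are clearer.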