gperdrizet committed on
Commit
0000c5e
·
verified ·
1 Parent(s): 6b87253

Added RAG functions.

Browse files
Files changed (2) hide show
  1. functions/rag.py +51 -0
  2. functions/tools.py +50 -1
functions/rag.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''Collection of functions for RAG on article texts.'''
2
+
3
+ import os
4
+ import logging
5
+ import queue
6
+ from semantic_text_splitter import TextSplitter
7
+ from tokenizers import Tokenizer
8
+ from upstash_vector import Index, Vector
9
+
10
+
11
def ingest(rag_ingest_queue: queue.Queue) -> None:
    '''Semantically chunks articles and upserts them to the Upstash vector db
    using the article title as namespace.

    Runs forever as a queue consumer: blocks on rag_ingest_queue, and for
    each item whose title is not already a namespace in the index, splits
    the text into chunks and upserts one vector per chunk.

    Args:
        rag_ingest_queue: queue of article dicts; each item is read via its
            'title' and 'content' keys.

    Raises:
        KeyError: if the UPSTASH_VECTOR_KEY environment variable is not set.
    '''

    # getLogger, not getLevelName: the latter returns a level-name string,
    # which has no .info() method.
    logger = logging.getLogger(__name__ + '.ingest()')

    index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
    )

    # Built lazily on first use and then reused; the tokenizer and splitter
    # do not depend on the article being processed.
    splitter = None

    while True:

        item = rag_ingest_queue.get()

        # One bad article must not kill the worker thread, otherwise every
        # later item would sit in the queue forever.
        try:
            title = item['title']
            text = item['content']
            logger.info('Got %s from RAG ingest queue', title)

            # Fetch the namespace list only after an item arrives, so the
            # check is not made against a list read before a long wait.
            namespaces = index.list_namespaces()

            if title in namespaces:
                logger.info('%s already in RAG namespace', title)
                continue

            if splitter is None:
                tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
                splitter = TextSplitter.from_huggingface_tokenizer(tokenizer, 256)

            # Stays 0 when the splitter yields no chunks, so the summary
            # log line below never hits an unbound loop variable.
            chunk_count = 0

            for chunk_count, chunk in enumerate(splitter.chunks(text), start=1):
                index.upsert(
                    vectors=[
                        Vector(
                            # NOTE(review): str hashes are salted per process
                            # unless PYTHONHASHSEED is fixed, so these IDs are
                            # not reproducible across runs - confirm that
                            # nothing relies on stable IDs.
                            id=hash(f'{title}-{chunk_count - 1}'),
                            data=chunk,
                        )
                    ],
                    namespace=title
                )

            logger.info('Ingested %s chunks into vector DB', chunk_count)

        except Exception:  # worker-loop boundary: log and keep consuming
            logger.exception('RAG ingest failed for queue item')
functions/tools.py CHANGED
@@ -1,10 +1,25 @@
1
  '''Tool functions for MCP server'''
2
 
 
 
3
  import time
4
  import json
5
  import logging
 
 
 
6
  import functions.feed_extraction as extraction_funcs
7
  import functions.summarization as summarization_funcs
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  def get_feed(website: str) -> list:
@@ -38,18 +53,52 @@ def get_feed(website: str) -> list:
38
  content = extraction_funcs.parse_feed(feed_uri)
39
  logger.info('parse_feed() returned %s entries', len(list(content.keys())))
40
 
41
- # Summarize each post in the feed
42
  for i, item in content.items():
43
 
44
  if item['content'] is not None:
 
 
 
 
45
  summary = summarization_funcs.summarize_content(
46
  item['title'],
47
  item['content']
48
  )
 
49
  content[i]['summary'] = summary
 
50
 
51
  content[i].pop('content', None)
52
 
53
  logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
54
 
55
  return json.dumps(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  '''Tool functions for MCP server'''
2
 
3
+ import os
4
+ import threading
5
  import time
6
  import json
7
  import logging
8
+ import queue
9
+ from upstash_vector import Index, Vector
10
+
11
  import functions.feed_extraction as extraction_funcs
12
  import functions.summarization as summarization_funcs
13
+ import functions.rag as rag_funcs
14
+
15
# Queue through which get_feed() hands article dicts to the RAG ingest worker
RAG_INGEST_QUEUE = queue.Queue()

# Background consumer for RAG_INGEST_QUEUE. The worker loops forever, so it
# is marked as a daemon thread: a non-daemon thread that never returns would
# keep the interpreter alive after the main thread has finished.
rag_ingest_thread = threading.Thread(
    target=rag_funcs.ingest,
    args=(RAG_INGEST_QUEUE,),
    daemon=True
)

rag_ingest_thread.start()
23
 
24
 
25
  def get_feed(website: str) -> list:
 
53
  content = extraction_funcs.parse_feed(feed_uri)
54
  logger.info('parse_feed() returned %s entries', len(list(content.keys())))
55
 
56
+ # Summarize each post in the feed and submit full text for RAG ingest
57
  for i, item in content.items():
58
 
59
  if item['content'] is not None:
60
+
61
+ RAG_INGEST_QUEUE.put(item)
62
+ logger.info('%s sent to RAG ingest', item['title'])
63
+
64
  summary = summarization_funcs.summarize_content(
65
  item['title'],
66
  item['content']
67
  )
68
+
69
  content[i]['summary'] = summary
70
+ logger.info('Summary of %s generated', item['title'])
71
 
72
  content[i].pop('content', None)
73
 
74
  logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
75
 
76
  return json.dumps(content)
77
+
78
+
79
def context_search(query: str, article_title: str = None) -> str:
    '''Searches for context relevant to query in article vector store.

    Args:
        query: user query to find context for
        article_title: optional, use this argument to search only for context
            from a specific article

    Returns:
        The top 3 matches for the query, as returned by the index query call.
        NOTE(review): this is the raw query result rather than a plain
        string, despite the return annotation - confirm what callers expect.

    Raises:
        KeyError: if the UPSTASH_VECTOR_KEY environment variable is not set.
    '''

    index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
    )

    # The query is raw text, so it goes in as `data` for server-side
    # embedding (matching how ingest upserts `data=chunk`), not in the
    # positional `vector` slot. include_data asks for the chunk text back.
    query_args = {
        'data': query,
        'top_k': 3,
        'include_data': True,
    }

    # Only pass a namespace when a title was actually given; forwarding None
    # would not select the client's default namespace.
    # NOTE(review): without a title this presumably searches only the
    # default namespace, not every article - confirm the intended behaviour.
    if article_title:
        query_args['namespace'] = article_title

    return index.query(**query_args)