gperdrizet committed
Commit fb7ecc7 · verified · 1 Parent(s): a3e696b

Improved caching

functions/feed_extraction.py CHANGED
@@ -2,7 +2,6 @@
 
 import os
 import re
-import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -53,8 +52,8 @@ def find_feed_uri(website: str) -> str:
         feed_uri = FEED_URIS[website]
         logger.info('%s feed URI in local cache: %s', website, feed_uri)
 
-    # Then, check to see if the URI is in the Redis cache
-    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    # If we still haven't found it, check to see if the URI is in the Redis cache
+    cache_key = f'{website} feed uri'
     cache_hit = False
 
     if feed_uri is None:
@@ -65,7 +64,7 @@ def find_feed_uri(website: str) -> str:
             feed_uri = cached_uri
             logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
 
-    # If none of those get it - try feedparse if it looks like a url
+    # If still none of those methods get it - try feedparse if it looks like a url
     # or else just google it
     if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
@@ -79,6 +78,7 @@ def find_feed_uri(website: str) -> str:
         feed_uri = _get_feed(website_url)
         logger.info('get_feed() returned %s', feed_uri)
 
+    # Add to local cache
     FEED_URIS[website] = feed_uri
 
     # Add the feed URI to the redis cache if it wasn't already there
@@ -88,14 +88,16 @@ def find_feed_uri(website: str) -> str:
     return feed_uri
 
 
-def parse_feed(feed_uri: str) -> list:
+def parse_feed(feed_uri: str, n: int) -> list:
     '''Gets content from a remote RSS feed URI.
 
     Args:
         feed_uri: The RSS feed to get content from
+        n: the number of feed entries to parse
 
     Returns:
-        List of titles for the 10 most recent entries in the RSS feed.
+        List of dictionaries for the n most recent entries in the RSS feed.
+        Each dictionary contains 'title', 'link' and 'content' keys.
     '''
 
     logger = logging.getLogger(__name__ + '.parse_feed')
@@ -112,16 +114,15 @@ def parse_feed(feed_uri: str) -> list:
         if 'title' in entry and 'link' in entry:
 
             title = entry.title
+            entry_content['title'] = title
 
-            # Check the Redis cache for this entry
-            cache_key = title.lower().replace(' ', '_')
-            cache_hit = False
-            cached_entry = REDIS.get(cache_key)
-
-            if cached_entry:
-                cache_hit = True
-                entry_content = json.loads(cached_entry)
+            # Check the Redis cache
+            cached_link = REDIS.get(f'{title} link')
+
+            if cached_link:
                 logger.info('Entry in Redis cache: "%s"', title)
+                entry_content['link'] = cached_link
+                entry_content['content'] = REDIS.get(f'{title} content')
 
             # If its not in the Redis cache, parse it from the feed data
             else:
@@ -129,24 +130,26 @@ def parse_feed(feed_uri: str) -> list:
                 entry_content['link'] = entry.link
                 entry_content['content'] = None
 
+                # Grab the article content from the feed, if provided
                 if 'content' in entry:
                     entry_content['content'] = entry.content
 
-                if entry_content['content'] is None:
+                # If not, try to get the article content from the link
+                elif entry_content['content'] is None:
 
                     html = _get_html(entry_content['link'])
                     content = _get_text(html)
                     entry_content['content'] = content
 
-                logger.info('Parsed entry: "%s"', title)
-
-            # Add it to the Redis cache if it wasn't there
-            if cache_hit is False:
-                REDIS.set(cache_key, entry_content)
+                # Add everything to the cache
+                REDIS.set(f'{title} link', entry_content['link'])
+                REDIS.set(f'{title} content', entry_content['content'])
+
+                logger.info('Parsed entry: "%s"', title)
 
             entries[i] = entry_content
 
-            if i == 2:
+            if i == n-1:
                 break
 
     logger.info('Entries contains %s elements', len(list(entries.keys())))
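Note: this hunk replaces the single serialized-entry cache with one Redis key per field, keyed by article title. A minimal sketch of that read/write pattern, assuming a redis-py style client created with decode_responses=True (the connection details and sample values are illustrative, not part of the commit):

import redis

# Illustrative connection only; the module under change builds its own REDIS client
REDIS = redis.Redis(host='localhost', port=6379, decode_responses=True)

title = 'Example article'

# Write path, mirroring parse_feed(): one key per field
REDIS.set(f'{title} link', 'https://example.com/post')
REDIS.set(f'{title} content', 'Full article text...')

# Read path: a hit on the link key means the entry was cached earlier
cached_link = REDIS.get(f'{title} link')
if cached_link:
    entry = {'title': title, 'link': cached_link, 'content': REDIS.get(f'{title} content')}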
functions/gradio_functions.py CHANGED
@@ -6,7 +6,7 @@ import logging
 
 from openai import OpenAI
 
-async def call_modal() -> None:
+def call_modal() -> None:
     '''Sends request to Modal to spin up container'''
 
     logger = logging.getLogger(__name__ + '.call_modal()')
@@ -30,7 +30,7 @@ async def call_modal() -> None:
         }
     ]
 
-    logger.info('Prompt: %s': messages[0]['content'])
+    logger.info('Prompt: %s', messages[0]['content'])
 
     completion_args = {
         'model': model_id,
functions/rag.py CHANGED
@@ -24,7 +24,7 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()
 
     item = rag_ingest_queue.get()
-    logger.info(item)
+    logger.info('Upserting "%s": %s', item['title'], item)
     title = item['title']
 
     if title not in namespaces:
@@ -36,15 +36,6 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
         chunks=splitter.chunks(text)
 
         for i, chunk in enumerate(chunks):
-            # index.upsert(
-            #     vectors=[
-            #         Vector(
-            #             id=hash(f'{title}-{i}'),
-            #             data=chunk,
-            #         )
-            #     ],
-            #     namespace=title
-            # )
 
             index.upsert(
                 [
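The ingest worker upserts each chunk under a namespace named after the article title, which is what context_search() later queries. The new upsert call is truncated in this view; a hypothetical shape based on the removed commented-out code, assuming an Upstash Vector index configured with a server-side embedding model:

import os
from upstash_vector import Index, Vector

index = Index(
    url='https://living-whale-89944-us1-vector.upstash.io',
    token=os.environ['UPSTASH_VECTOR_KEY']
)

title = 'Example article'
chunks = ['first chunk of text', 'second chunk of text']

for i, chunk in enumerate(chunks):
    # One record per chunk, grouped by article title so searches can be scoped
    index.upsert(
        vectors=[Vector(id=f'{title}-{i}', data=chunk)],
        namespace=title,
    )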
functions/summarization.py CHANGED
@@ -25,7 +25,7 @@ def summarize_content(title: str, content: str) -> str:
     logger.info('Summarizing extracted content')
 
     # Check Redis cache for summary
-    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cache_key = f'{title} summary'
     cached_summary = REDIS.get(cache_key)
 
     if cached_summary:
@@ -77,6 +77,8 @@ def summarize_content(title: str, content: str) -> str:
     else:
         summary = None
 
+    # Add the new summary to the cache
     REDIS.set(cache_key, summary)
     logger.info('Summarized: "%s"', title)
+
     return summary
functions/tools.py CHANGED
@@ -6,7 +6,7 @@ import time
 import json
 import logging
 import queue
-from upstash_vector import Index, Vector
+from upstash_vector import Index
 
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
@@ -22,14 +22,17 @@ rag_ingest_thread = threading.Thread(
 rag_ingest_thread.start()
 
 
-def get_feed(website: str) -> list:
+def get_feed(website: str, n: int = 3) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed.
+    n items in feed. This function is slow and resource heavy, only call it when
+    the user wants to check a feed for new content, or asks for content from a
+    feed that you have not retrieved yet.
 
     Args:
         website: URL or name of website to extract RSS feed content from
+        n: (optional) number of articles to parse from feed, defaults to 3
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
@@ -50,44 +53,57 @@ def get_feed(website: str) -> list:
         return 'No feed found'
 
     # Parse and extract content from the feed
-    content = extraction_funcs.parse_feed(feed_uri)
-    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
+    articles = extraction_funcs.parse_feed(feed_uri, n)
+    logger.info('parse_feed() returned %s entries', len(list(articles.keys())))
 
-    # Summarize each post in the feed and submit full text for RAG ingest
-    for i, item in content.items():
+    # Loop on the posts, sending them to RAG (nonblocking) and summarization (blocking)
+    for i, item in articles.items():
 
+        # Check if content is present
         if item['content'] is not None:
+            logger.info('Summarizing/RAG ingesting: %s', item)
 
-            RAG_INGEST_QUEUE.put(item)
+            # Send to RAG ingest
+            RAG_INGEST_QUEUE.put(item.copy())
             logger.info('"%s" sent to RAG ingest', item['title'])
 
+            # Generate summary and add to content
             summary = summarization_funcs.summarize_content(
                 item['title'],
                 item['content']
            )
 
-            content[i]['summary'] = summary
+            articles[i]['summary'] = summary
             logger.info('Summary of "%s" generated', item['title'])
 
-            content[i].pop('content', None)
+            # Remove full-text content before returning
+            articles[i].pop('content', None)
 
     logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
 
-    return json.dumps(content)
+    # Return content dictionary as string
+    return json.dumps(articles)
 
 
 def context_search(query: str, article_title: str = None) -> str:
-    '''Searches for context relevant to query in article vector store.
+    '''Searches for context relevant to query in article vector store.
+    Use this Function to search for additional information before
+    answering the user's question about an article. If article_title is
+    provided the search will only return results from that article. If
+    article_title is omitted, the search will include all articles
+    currently in the cache.
 
     Ags:
         query: user query to find context for
-        article_title: optional, use this argument to search only for context
-        from a specific context
+        article_title: optional, use this argument to search only for
+        context from a specific article, defaults to None
 
     Returns:
-        Context which bests matches query as string.
+        Context information which best matches the query.
     '''
 
+    logger = logging.getLogger(__name__ + 'context_search')
+
     index = Index(
         url='https://living-whale-89944-us1-vector.upstash.io',
         token=os.environ['UPSTASH_VECTOR_KEY']
@@ -103,4 +119,6 @@ def context_search(query: str, article_title: str = None) -> str:
         namespace=article_title
     )
 
+    logger.info('Retrieved %s chunks for "%s"', len(results), query)
+
     return results
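With the new signature, callers can bound how many feed entries are parsed, summarized and RAG-ingested per call. A rough usage sketch (the website value is illustrative; get_feed() returns either a JSON string keyed by entry index or 'No feed found'):

import json

import functions.tools as tool_funcs

# Parse, summarize and ingest the three most recent entries (the default n=3)
feed_json = tool_funcs.get_feed('hackernews.com')

if feed_json != 'No feed found':
    articles = json.loads(feed_json)

    for key, article in articles.items():
        print(key, article['title'], article.get('summary'))

    # Follow-up questions can then be answered from the vector store
    print(tool_funcs.context_search('What was the main point?', article_title=None))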
rss_server.py CHANGED
@@ -1,6 +1,5 @@
 '''Main script to run gradio interface and MCP server.'''
 
-import asyncio
 import logging
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
@@ -10,8 +9,9 @@ import assets.html as html
 import functions.tools as tool_funcs
 import functions.gradio_functions as gradio_funcs
 
-# Call the modal container so it spins up
-asyncio.run(gradio_funcs.call_modal())
+# Call the modal container so it spins up before the rest of
+# the app starts
+gradio_funcs.call_modal()
 
 # Set-up logging
 # Make sure log directory exists