Improved caching
- functions/feed_extraction.py +22 -19
- functions/gradio_functions.py +2 -2
- functions/rag.py +1 -10
- functions/summarization.py +3 -1
- functions/tools.py +33 -15
- rss_server.py +3 -3
functions/feed_extraction.py
CHANGED
@@ -2,7 +2,6 @@

 import os
 import re
-import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -53,8 +52,8 @@ def find_feed_uri(website: str) -> str:
        feed_uri = FEED_URIS[website]
        logger.info('%s feed URI in local cache: %s', website, feed_uri)

-    # …
-    cache_key = f…
+    # If we still haven't found it, check to see if the URI is in the Redis cache
+    cache_key = f'{website} feed uri'
     cache_hit = False

     if feed_uri is None:
@@ -65,7 +64,7 @@ def find_feed_uri(website: str) -> str:
            feed_uri = cached_uri
            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)

-    # If none of those get it - try feedparse if it looks like a url
+    # If still none of those methods get it - try feedparse if it looks like a url
     # or else just google it
     if feed_uri is None:
        if website.split('.')[-1] in COMMON_EXTENSIONS:
@@ -79,6 +78,7 @@ def find_feed_uri(website: str) -> str:
            feed_uri = _get_feed(website_url)
            logger.info('get_feed() returned %s', feed_uri)

+        # Add to local cache
        FEED_URIS[website] = feed_uri

     # Add the feed URI to the redis cache if it wasn't already there
@@ -88,14 +88,16 @@ def find_feed_uri(website: str) -> str:
     return feed_uri


-def parse_feed(feed_uri: str) -> list:
+def parse_feed(feed_uri: str, n: int) -> list:
     '''Gets content from a remote RSS feed URI.

     Args:
        feed_uri: The RSS feed to get content from
+        n: the number of feed entries to parse

     Returns:
-        List of …
+        List of dictionaries for the n most recent entries in the RSS feed.
+        Each dictionary contains 'title', 'link' and 'content' keys.
     '''

     logger = logging.getLogger(__name__ + '.parse_feed')
@@ -112,16 +114,15 @@ def parse_feed(feed_uri: str) -> list:
        if 'title' in entry and 'link' in entry:

            title = entry.title
+            entry_content['title'] = title

-            # Check the Redis cache
-            cache_hit = False
-            cached_entry = REDIS.get(cache_key)
+            # Check the Redis cache
+            cached_link = REDIS.get(f'{title} link')

-            if …
-                cache_hit = True
-                entry_content = json.loads(cached_entry)
+            if cached_link:
                logger.info('Entry in Redis cache: "%s"', title)
+                entry_content['link'] = cached_link
+                entry_content['content'] = REDIS.get(f'{title} content')

            # If its not in the Redis cache, parse it from the feed data
            else:
@@ -129,24 +130,26 @@ def parse_feed(feed_uri: str) -> list:
                entry_content['link'] = entry.link
                entry_content['content'] = None

+                # Grab the article content from the feed, if provided
                if 'content' in entry:
                    entry_content['content'] = entry.content

-                …
+                # If not, try to get the article content from the link
+                elif entry_content['content'] is None:

                    html = _get_html(entry_content['link'])
                    content = _get_text(html)
                    entry_content['content'] = content

-            …
-            if cache_hit is False:
-                REDIS.set(cache_key, entry_content)
+                # Add everything to the cache
+                REDIS.set(f'{title} link', entry_content['link'])
+                REDIS.set(f'{title} content', entry_content['content'])
+
+                logger.info('Parsed entry: "%s"', title)

            entries[i] = entry_content

-            if i == …
+            if i == n-1:
                break

     logger.info('Entries contains %s elements', len(list(entries.keys())))
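The commit replaces the old single-key JSON entry cache (hence dropping `import json`) with one Redis string per field, keyed on the entry title. A minimal sketch of that read-through pattern, assuming a redis-py client with `decode_responses=True`; the helper name and fetcher are hypothetical:

import redis

# Assumed client setup; the repo configures its REDIS handle elsewhere
REDIS = redis.Redis(host='localhost', port=6379, decode_responses=True)

def get_entry_cached(title: str, link: str, fetch_content) -> dict:
    '''Hypothetical helper: return a cached entry if present, otherwise
    build it and write each field under its own key.'''

    cached_link = REDIS.get(f'{title} link')

    if cached_link:
        # Cache hit: link and content were written together, so read both
        return {'title': title, 'link': cached_link,
                'content': REDIS.get(f'{title} content')}

    # Cache miss: extract the content, then cache each field separately
    content = fetch_content(link)
    REDIS.set(f'{title} link', link)
    REDIS.set(f'{title} content', content)

    return {'title': title, 'link': link, 'content': content}

One caveat: redis-py's set() raises DataError on None, so this pattern relies on the content extraction always producing a string before the entry is cached.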
functions/gradio_functions.py
CHANGED
@@ -6,7 +6,7 @@ import logging

 from openai import OpenAI

-async def call_modal() -> None:
+def call_modal() -> None:
     '''Sends request to Modal to spin up container'''

     logger = logging.getLogger(__name__ + '.call_modal()')
@@ -30,7 +30,7 @@ async def call_modal() -> None:
        }
     ]

-    logger.info('Prompt: %s'…
+    logger.info('Prompt: %s', messages[0]['content'])

     completion_args = {
        'model': model_id,
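With async dropped, the warm-up can run as a plain blocking call during startup (see rss_server.py below). A sketch of the shape, assuming an OpenAI-compatible endpoint served from Modal; the URL and model id are placeholders:

import logging
from openai import OpenAI

def call_modal() -> None:
    '''Sends one small request so the Modal container cold-starts.'''

    logger = logging.getLogger(__name__ + '.call_modal()')

    # Placeholder endpoint and model id; the real values live in the repo
    client = OpenAI(base_url='https://example--vllm.modal.run/v1', api_key='unused')
    messages = [{'role': 'user', 'content': 'Hello'}]

    logger.info('Prompt: %s', messages[0]['content'])

    try:
        client.chat.completions.create(model='placeholder-model', messages=messages)
    except Exception:
        # Only the cold start matters here, so a failed response is non-fatal
        logger.info('Warm-up request failed; container may still be starting')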
functions/rag.py
CHANGED
@@ -24,7 +24,7 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()

     item = rag_ingest_queue.get()
-    logger.info(item)
+    logger.info('Upserting "%s": %s', item['title'], item)
     title = item['title']

     if title not in namespaces:
@@ -36,15 +36,6 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
        chunks=splitter.chunks(text)

        for i, chunk in enumerate(chunks):
-            # index.upsert(
-            #     vectors=[
-            #         Vector(
-            #             id=hash(f'{title}-{i}'),
-            #             data=chunk,
-            #         )
-            #     ],
-            #     namespace=title
-            # )

            index.upsert(
                [
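For context, ingest() is the body of a queue-worker thread: tools.py puts parsed articles on a queue and this loop chunks and upserts them. A self-contained sketch of that shape, with the splitter and the Upstash upsert reduced to stand-ins:

import queue
import threading

def ingest(rag_ingest_queue: queue.Queue) -> None:
    '''Worker loop: take an item off the queue, chunk it, upsert the chunks.'''

    while True:
        item = rag_ingest_queue.get()  # blocks until an item arrives
        title, text = item['title'], item['content']

        # Stand-in for the semantic splitter used in the repo
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

        for i, chunk in enumerate(chunks):
            # Stand-in for index.upsert(); each chunk gets a stable id
            print(f'upsert {title}-{i}: {len(chunk)} chars')

        rag_ingest_queue.task_done()

# The worker runs as a daemon thread feeding off a module-level queue
RAG_INGEST_QUEUE = queue.Queue()
threading.Thread(target=ingest, args=(RAG_INGEST_QUEUE,), daemon=True).start()
RAG_INGEST_QUEUE.put({'title': 'Example', 'content': 'Some article text.'})
RAG_INGEST_QUEUE.join()  # wait for the worker to finish this item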
functions/summarization.py
CHANGED
@@ -25,7 +25,7 @@ def summarize_content(title: str, content: str) -> str:
     logger.info('Summarizing extracted content')

     # Check Redis cache for summary
-    cache_key = f…
+    cache_key = f'{title} summary'
     cached_summary = REDIS.get(cache_key)

     if cached_summary:
@@ -77,6 +77,8 @@ def summarize_content(title: str, content: str) -> str:
     else:
        summary = None

+    # Add the new summary to the cache
     REDIS.set(cache_key, summary)
     logger.info('Summarized: "%s"', title)
+
     return summary
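The summary cache is the same cache-aside pattern keyed on the title. Worth noting: the diff calls REDIS.set(cache_key, summary) even when summary is None, which redis-py rejects with a DataError; a minimal sketch with that case guarded, assuming a hypothetical generate_summary() in place of the repo's LLM call:

import redis

REDIS = redis.Redis(decode_responses=True)  # assumed client setup

def generate_summary(content: str) -> str:
    '''Hypothetical stand-in for the repo's LLM summarization call.'''
    return content[:200]

def summarize_content(title: str, content: str) -> str:
    '''Cache-aside summarization keyed on the article title.'''

    cache_key = f'{title} summary'
    cached_summary = REDIS.get(cache_key)

    if cached_summary:
        return cached_summary

    summary = generate_summary(content)

    if summary is not None:  # redis-py set() raises DataError on None
        REDIS.set(cache_key, summary)

    return summary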
functions/tools.py
CHANGED
@@ -6,7 +6,7 @@ import time
 import json
 import logging
 import queue
-from upstash_vector import Index
+from upstash_vector import Index

 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
@@ -22,14 +22,17 @@ rag_ingest_thread = threading.Thread(
 rag_ingest_thread.start()


-def get_feed(website: str) -> list:
+def get_feed(website: str, n: int = 3) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed.
+    n items in feed. This function is slow and resource heavy, only call it
+    when the user wants to check a feed for new content, or asks for content
+    from a feed that you have not retrieved yet.

     Args:
        website: URL or name of website to extract RSS feed content from
+        n: (optional) number of articles to parse from feed, defaults to 3

     Returns:
        JSON string containing the feed content or 'No feed found' if a RSS
@@ -50,44 +53,57 @@ def get_feed(website: str) -> list:
        return 'No feed found'

     # Parse and extract content from the feed
-    …
-    logger.info('parse_feed() returned %s entries', len(list(…
+    articles = extraction_funcs.parse_feed(feed_uri, n)
+    logger.info('parse_feed() returned %s entries', len(list(articles.keys())))

-    # …
-    for i, item in …
+    # Loop on the posts, sending them to RAG (nonblocking) and summarization (blocking)
+    for i, item in articles.items():

+        # Check if content is present
        if item['content'] is not None:
+            logger.info('Summarizing/RAG ingesting: %s', item)

-            …
+            # Send to RAG ingest
+            RAG_INGEST_QUEUE.put(item.copy())
            logger.info('"%s" sent to RAG ingest', item['title'])

+            # Generate summary and add to content
            summary = summarization_funcs.summarize_content(
                item['title'],
                item['content']
            )

-            …
+            articles[i]['summary'] = summary
            logger.info('Summary of "%s" generated', item['title'])

-        content…
+        # Remove full-text content before returning
+        articles[i].pop('content', None)

     logger.info('Completed in %s seconds', round(time.time()-start_time, 2))

-    …
+    # Return content dictionary as string
+    return json.dumps(articles)


 def context_search(query: str, article_title: str = None) -> str:
-    '''Searches for context relevant to query in article vector store.
+    '''Searches for context relevant to query in article vector store.
+    Use this function to search for additional information before
+    answering the user's question about an article. If article_title is
+    provided the search will only return results from that article. If
+    article_title is omitted, the search will include all articles
+    currently in the cache.

     Args:
        query: user query to find context for
-        article_title: optional, use this argument to search only for
-        …
+        article_title: optional, use this argument to search only for
+        context from a specific article, defaults to None

     Returns:
-        Context which best matches query
+        Context information which best matches the query.
     '''

+    logger = logging.getLogger(__name__ + '.context_search')
+
     index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
@@ -103,4 +119,6 @@ def context_search(query: str, article_title: str = None) -> str:
        namespace=article_title
     )

+    logger.info('Retrieved %s chunks for "%s"', len(results), query)
+
     return results
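As a usage sketch, the new n parameter caps how many entries each call parses and summarizes, and the tool returns a JSON string keyed by entry index; the website name here is illustrative:

import json
import functions.tools as tool_funcs  # the module shown above

raw = tool_funcs.get_feed('hackernews.com', n=2)  # parse at most two entries
articles = json.loads(raw)

for i, article in articles.items():
    # json.loads leaves the integer indices as string keys;
    # 'content' was popped before return, leaving title, link and summary
    print(i, article['title'], article['link'], article.get('summary'))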
rss_server.py
CHANGED
@@ -1,6 +1,5 @@
 '''Main script to run gradio interface and MCP server.'''

-import asyncio
 import logging
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
@@ -10,8 +9,9 @@ import assets.html as html
 import functions.tools as tool_funcs
 import functions.gradio_functions as gradio_funcs

-# Call the modal container so it spins up
-…
+# Call the modal container so it spins up before the rest of
+# the app starts
+gradio_funcs.call_modal()

 # Set-up logging
 # Make sure log directory exists
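Since call_modal() now blocks until the request returns, startup waits on the warm-up to complete. If that blocking ever becomes a problem, the same call could be made fire-and-forget instead; a one-line alternative (not what this commit does):

import threading
import functions.gradio_functions as gradio_funcs

# Warm up the Modal container without blocking the rest of startup
threading.Thread(target=gradio_funcs.call_modal, daemon=True).start()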