Added Redis caching for feed content and article summaries.
- functions/feed_extraction.py (+27 -16)
- functions/summarization.py (+21 -13)
- functions/tools.py (+20 -31)
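All three files share one Upstash Redis instance, accessed through its REST client. As a reference for reading the diffs below, here is a minimal sketch of that read/write round-trip; the instance URL and the UPSTASH_REDIS_KEY environment variable are taken from the diff, while the key and payload are invented for illustration:

    import os
    import json

    from upstash_redis import Redis

    # Connect to the Upstash Redis instance over REST
    redis = Redis(
        url='https://sensible-midge-19304.upstash.io',
        token=os.environ['UPSTASH_REDIS_KEY']
    )

    # Redis stores strings, so dict values are serialized to JSON on write...
    entry = {'title': 'Example Post', 'link': 'https://example.com/post'}
    redis.set('example_post', json.dumps(entry))

    # ...and parsed back on read; get() returns None on a cache miss
    cached = redis.get('example_post')
    if cached is not None:
        entry = json.loads(cached)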
functions/feed_extraction.py
CHANGED

@@ -2,6 +2,7 @@
 
 import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError

@@ -110,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
 
         if 'title' in entry and 'link' in entry:
 
-            entry_content['title'] = entry.title
-            entry_content['link'] = entry.link
+            title = entry.title
 
+            # Check the Redis cache for this entry
+            cache_key = title.lower().replace(' ', '_')
+            cache_hit = False
+            cached_entry = REDIS.get(cache_key)
 
+            if cached_entry:
+                cache_hit = True
+                entry_content = json.loads(cached_entry)
+                logger.info('Entry in Redis cache: "%s"', title)
 
+            # If it's not in the Redis cache, parse it from the feed data
+            else:
+                entry_content['title'] = entry.title
+                entry_content['link'] = entry.link
+                entry_content['content'] = None
+
+                if 'content' in entry:
+                    entry_content['content'] = entry.content
+
+                if entry_content['content'] is None:
 
+                    html = _get_html(entry_content['link'])
+                    content = _get_text(html)
+                    entry_content['content'] = content
 
+                logger.info('Parsed entry: "%s"', title)
 
+            # Add it to the Redis cache if it wasn't there
+            if cache_hit is False:
+                REDIS.set(cache_key, entry_content)
 
         entries[i] = entry_content
 
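The entry cache above and the summary cache below build their keys the same way: lower-case the title and replace spaces with underscores, with the summary key adding a '-summary' suffix. A worked example of that normalization, using a helper name that is mine rather than the commit's:

    def make_cache_key(title: str, suffix: str = '') -> str:
        '''Normalizes an entry title into a Redis key.'''
        key = title.lower().replace(' ', '_')
        return f'{key}-{suffix}' if suffix else key

    # 'My First Post' caches the entry under 'my_first_post'
    # and its summary under 'my_first_post-summary'
    assert make_cache_key('My First Post') == 'my_first_post'
    assert make_cache_key('My First Post', 'summary') == 'my_first_post-summary'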
functions/summarization.py
CHANGED

@@ -4,9 +4,14 @@ import os
 import logging
 
 from openai import OpenAI
+from upstash_redis import Redis
 
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
-def summarize_content(content: str) -> str:
+def summarize_content(title: str, content: str) -> str:
     '''Generates summary of article content using Modal inference endpoint.
 
     Args:

@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
     logger = logging.getLogger(__name__ + '.summarize_content')
     logger.info('Summarizing extracted content')
 
+    # Check Redis cache for summary
+    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cached_summary = REDIS.get(cache_key)
+
+    if cached_summary:
+        logger.info('Got summary from Redis cache: "%s"', title)
+        return cached_summary
+
+    # If the summary is not in the cache, generate it
     client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
 
     client.base_url = (

@@ -29,16 +43,6 @@ def summarize_content(content: str) -> str:
     model = client.models.list().data[0]
     model_id = model.id
 
-    # messages = [
-    #     {
-    #         'role': 'system',
-    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
-    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
-    #         'role': 'user',
-    #         'content': content
-    #     }
-    # ]
-
     messages = [
         {
             'role': 'system',

@@ -68,7 +72,11 @@ def summarize_content(content: str) -> str:
         logger.error('Error during Modal API call: %s', e)
 
     if response is not None:
-        return response.choices[0].message.content
+        summary = response.choices[0].message.content
 
     else:
-        return None
+        summary = None
+
+    REDIS.set(cache_key, summary)
+    logger.info('Summarized: "%s"', title)
+    return summary
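With the cache in place, a repeat summary of the same article is served from Redis instead of the Modal inference endpoint. A minimal usage sketch, assuming MODAL_API_KEY and UPSTASH_REDIS_KEY are set; the title and body are invented:

    from functions.summarization import summarize_content

    body = 'Full text of the article goes here...'

    # First call hits the Modal endpoint, then caches the result
    # under 'my_first_post-summary'
    summary = summarize_content('My First Post', body)

    # Second call returns the cached string without any inference call
    assert summarize_content('My First Post', body) == summary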
functions/tools.py
CHANGED

@@ -6,20 +6,15 @@ import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
 
-LOCAL_CACHE = {
-    'get_feed': {}
-}
 
-def get_feed(website: str, use_cache: bool = True) -> list:
+def get_feed(website: str) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed
+    items in feed.
 
     Args:
         website: URL or name of website to extract RSS feed content from
-        use_cache: check local cache for content from RSS feed first before
-            downloading data from the website's RSS feed
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS

@@ -31,35 +26,29 @@ def get_feed(website: str, use_cache: bool = True) -> list:
     logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
-        logger.info('Got feed content from local cache')
-            return 'No feed found'
-        for i, item in content.items():
-            if item['content'] is not None:
-                summary = summarization_funcs.summarize_content(item['content'])
-                content[i]['summary'] = summary
-            content[i].pop('content', None)
-        LOCAL_CACHE['get_feed'][website] = content
+    # Find the feed's URI from the website name/URL
+    feed_uri = extraction_funcs.find_feed_uri(website)
+    logger.info('find_feed_uri() returned %s', feed_uri)
 
+    if 'No feed found' in feed_uri:
+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+        return 'No feed found'
 
+    # Parse and extract content from the feed
+    content = extraction_funcs.parse_feed(feed_uri)
+    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    # Summarize each post in the feed
+    for i, item in content.items():
 
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(
+                item['title'],
+                item['content']
+            )
+            content[i]['summary'] = summary
 
+        content[i].pop('content', None)
 
     logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
 
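Taken together, get_feed() no longer keeps an in-process LOCAL_CACHE; caching now happens inside parse_feed() and summarize_content() via Redis, so the tool itself stays stateless. A usage sketch with a placeholder site (per the docstring, a site name or a URL both work):

    from functions.tools import get_feed

    # Feed discovery, entry parsing, and summarization all happen here;
    # repeat calls for already-seen entries are served from the Redis cache
    result = get_feed('https://example.com/blog')

    if result == 'No feed found':
        print('No RSS feed found for this site')
    else:
        print(result)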