gperdrizet committed (verified)
Commit dc68696 · Parent(s): 663acf0

Added Redis caching for feed content and article summaries.
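The changes below apply the same cache-aside pattern in two places: look a key up in Redis, fall back to the expensive operation (feed parsing or LLM summarization) on a miss, then write the result back. A minimal sketch of the pattern, assuming the upstash_redis client used in this commit; fetch_article() and the URL are stand-ins for illustration only:

import os
import json
from upstash_redis import Redis

REDIS = Redis(
    url='https://example.upstash.io',  # Placeholder, not this project's instance
    token=os.environ['UPSTASH_REDIS_KEY']
)

def fetch_article(title: str) -> dict:
    # Stand-in for the real work (feed parsing, scraping, summarization)
    return {'title': title, 'content': '...'}

def get_article(title: str) -> dict:
    '''Returns article content, preferring the Redis cache.'''

    key = title.lower().replace(' ', '_')
    cached = REDIS.get(key)

    if cached:
        return json.loads(cached)  # Cache hit: deserialize the stored JSON

    article = fetch_article(title)
    REDIS.set(key, json.dumps(article))  # Serialize before caching
    return article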

functions/feed_extraction.py CHANGED
@@ -2,6 +2,7 @@
 
 import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -110,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
 
         if 'title' in entry and 'link' in entry:
 
-            entry_content['title'] = entry.title
-            entry_content['link'] = entry.link
-
-            # entry_content['updated'] = None
-            # entry_content['summary'] = None
-            entry_content['content'] = None
-
-            # if 'updated' in entry:
-            #     entry_content['updated'] = entry.updated
-
-            # if 'summary' in entry:
-            #     summary = _get_text(entry.summary)
-            #     entry_content['summary'] = summary
-
-            if 'content' in entry:
-                entry_content['content'] = entry.content
-
-            if entry_content['content'] is None:
-
-                html = _get_html(entry_content['link'])
-                content = _get_text(html)
-                entry_content['content'] = content
+            title = entry.title
+
+            # Check the Redis cache for this entry
+            cache_key = title.lower().replace(' ', '_')
+            cache_hit = False
+            cached_entry = REDIS.get(cache_key)
+
+            if cached_entry:
+                cache_hit = True
+                entry_content = json.loads(cached_entry)
+                logger.info('Entry in Redis cache: "%s"', title)
+
+            # If it's not in the Redis cache, parse it from the feed data
+            else:
+                entry_content['title'] = entry.title
+                entry_content['link'] = entry.link
+                entry_content['content'] = None
+
+                if 'content' in entry:
+                    entry_content['content'] = entry.content
+
+                if entry_content['content'] is None:
+
+                    html = _get_html(entry_content['link'])
+                    content = _get_text(html)
+                    entry_content['content'] = content
+
+                logger.info('Parsed entry: "%s"', title)
+
+            # Add it to the Redis cache if it wasn't there, serialized as JSON
+            if cache_hit is False:
+                REDIS.set(cache_key, json.dumps(entry_content))
 
         entries[i] = entry_content
 
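A note on the key scheme above: cache keys are derived by lowercasing the entry title and replacing spaces with underscores, so two titles that differ only in case map to the same key. A short illustration, with a hashed alternative shown as a hypothetical option (not something this commit does):

import hashlib

title = 'Redis Caching 101'
cache_key = title.lower().replace(' ', '_')  # 'redis_caching_101'

# Hashing the entry link instead would avoid title collisions
alt_key = hashlib.sha256('https://example.com/post'.encode()).hexdigest()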
functions/summarization.py CHANGED
@@ -4,9 +4,14 @@ import os
 import logging
 
 from openai import OpenAI
+from upstash_redis import Redis
 
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
-def summarize_content(content: str) -> str:
+def summarize_content(title: str, content: str) -> str:
     '''Generates summary of article content using Modal inference endpoint.
 
     Args:
@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
     logger = logging.getLogger(__name__ + '.summarize_content')
     logger.info('Summarizing extracted content')
 
+    # Check the Redis cache for an existing summary
+    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cached_summary = REDIS.get(cache_key)
+
+    if cached_summary:
+        logger.info('Got summary from Redis cache: "%s"', title)
+        return cached_summary
+
+    # If the summary is not in the cache, generate it
     client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
 
     client.base_url = (
@@ -29,16 +43,6 @@ def summarize_content(content: str) -> str:
     model = client.models.list().data[0]
     model_id = model.id
 
-    # messages = [
-    #     {
-    #         'role': 'system',
-    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
-    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
-    #         'role': 'user',
-    #         'content': content
-    #     }
-    # ]
-
     messages = [
         {
             'role': 'system',
@@ -68,7 +72,11 @@ def summarize_content(content: str) -> str:
         logger.error('Error during Modal API call: %s', e)
 
     if response is not None:
-        return response.choices[0].message.content
+        summary = response.choices[0].message.content
+        REDIS.set(cache_key, summary)  # Only cache successful summaries
+        logger.info('Summarized: "%s"', title)
 
     else:
-        return None
+        summary = None
+
+    return summary
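Under the new signature, callers pass the entry title along with the content, and the title doubles as the cache key, so repeat calls for the same title should be served from Redis without touching the Modal endpoint. A hedged usage sketch (assumes MODAL_API_KEY and UPSTASH_REDIS_KEY are set in the environment; the article text is illustrative):

from functions.summarization import summarize_content

text = 'Full article text...'
first = summarize_content('Example Post', text)   # Generates via Modal, then caches
second = summarize_content('Example Post', text)  # Returned from the Redis cache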
functions/tools.py CHANGED
@@ -6,20 +6,15 @@ import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
 
-LOCAL_CACHE = {
-    'get_feed': {}
-}
 
-def get_feed(website: str, use_cache: bool = True) -> list:
+def get_feed(website: str) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed
+    items in feed.
 
     Args:
         website: URL or name of website to extract RSS feed content from
-        use_cache: check local cache for content from RSS feed first before
-            downloading data from the website's RSS feed
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
@@ -31,35 +26,29 @@ def get_feed(website: str, use_cache: bool = True) -> list:
     logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
-    # Check to see if we have this feed cached, if desired
-    if use_cache is True and website in LOCAL_CACHE['get_feed']:
-        content = LOCAL_CACHE['get_feed'][website]
-        logger.info('Got feed content from local cache')
-
-    else:
-
-        # Find the feed's URI from the website name/URL
-        feed_uri = extraction_funcs.find_feed_uri(website)
-        logger.info('find_feed_uri() returned %s', feed_uri)
-
-        if 'No feed found' in feed_uri:
-            logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
-            return 'No feed found'
-
-        # Parse and extract content from the feed
-        content = extraction_funcs.parse_feed(feed_uri)
-        logger.info('parse_feed() returned %s entries', len(list(content.keys())))
-
-        # Summarize each post in the feed
-        for i, item in content.items():
-
-            if item['content'] is not None:
-                summary = summarization_funcs.summarize_content(item['content'])
-                content[i]['summary'] = summary
-
-            content[i].pop('content', None)
-
-        LOCAL_CACHE['get_feed'][website] = content
+    # Find the feed's URI from the website name/URL
+    feed_uri = extraction_funcs.find_feed_uri(website)
+    logger.info('find_feed_uri() returned %s', feed_uri)
+
+    if 'No feed found' in feed_uri:
+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+        return 'No feed found'
+
+    # Parse and extract content from the feed
+    content = extraction_funcs.parse_feed(feed_uri)
+    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
+
+    # Summarize each post in the feed
+    for i, item in content.items():
+
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(
+                item['title'],
+                item['content']
+            )
+            content[i]['summary'] = summary
+
+        content[i].pop('content', None)
 
     logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
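Taken together, get_feed() no longer does any caching of its own: parse_feed() caches raw entries and summarize_content() caches summaries, both keyed off the entry title, so the module-level LOCAL_CACHE (which lived and died with each process) could be deleted. A sketch of the resulting behavior across calls (the website value is illustrative):

from functions import tools

content = tools.get_feed('hackernews.com')  # Cold: parses entries and generates summaries
content = tools.get_feed('hackernews.com')  # Warm: entry and summary lookups hit Redis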