Commit 6c67813 (verified) · Author: gperdrizet · Parent: b72c075

Added caching of feed content and URI.

Files changed (3):
  1. functions/feed_extraction.py +27 -5
  2. functions/tools.py +36 -13
  3. rss_server.py +3 -2
functions/feed_extraction.py CHANGED
@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in
@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
 
-    # If neither of those get it - try feedparse if it looks like a url
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
-    else:
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)
@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
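
The lookup order this change establishes is: treat the input as a feed URI if it already looks like one, then check the in-process FEED_URIS dict, then the shared Upstash Redis cache, and only fall back to discovery on a full miss. The key scheme lowercases the site name and replaces spaces, so 'Hacker News' becomes 'hacker_news-feed-uri'. A minimal sketch of that read-through pattern, using the same upstash_redis client calls the commit uses; discover_uri() is a hypothetical stand-in for the findfeed/Google branch, and reading the Upstash URL from an environment variable is an assumption (the commit hardcodes it):

import os
from upstash_redis import Redis

FEED_URIS = {}  # tier 1: in-process dict, lost on restart
REDIS = Redis(
    url=os.environ['UPSTASH_URL'],  # assumption: env var instead of the hardcoded URL
    token=os.environ['UPSTASH_KEY']
)

def lookup_feed_uri(website: str) -> str:
    '''Read-through lookup: local dict, then Redis, then discovery.'''

    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"

    # Tier 1: fastest, but private to this worker process
    if website in FEED_URIS:
        return FEED_URIS[website]

    # Tier 2: shared across workers and survives restarts
    cached_uri = REDIS.get(cache_key)

    if cached_uri:
        FEED_URIS[website] = cached_uri  # promote to tier 1
        return cached_uri

    # Tier 3: full discovery, then populate both tiers
    feed_uri = discover_uri(website)  # hypothetical stand-in for findfeed/Google search
    FEED_URIS[website] = feed_uri
    REDIS.set(cache_key, feed_uri)

    return feed_uri

One difference worth noting: in the committed version a hit on the local dict leaves cache_hit at False, so the URI is written back to Redis even though it was never missing there; the sketch above writes Redis only on a genuine miss.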
functions/tools.py CHANGED
@@ -1,12 +1,16 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
 
+LOCAL_CACHE = {
+    'get_feed': {}
+}
 
-def get_feed(website: str) -> list:
+def get_feed(website: str, use_cache: bool = True) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
@@ -14,30 +18,49 @@ def get_feed(website: str) -> list:
 
     Args:
         website: URL or name of website to extract RSS feed content from
+        use_cache: check local cache for content from RSS feed first before
+            downloading data from the website's RSS feed
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
         feed for the requested website could not be found
     '''
 
-    logger = logging.getLogger(__name__ + '.get_content')
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
-    feed_uri = extraction_funcs.find_feed_uri(website)
-    logger.info('find_feed_uri() returned %s', feed_uri)
+    # Check to see if we have this feed cached, if desired
+    if use_cache is True and website in LOCAL_CACHE['get_feed']:
+        content = LOCAL_CACHE['get_feed'][website]
+        logger.info('Got feed content from local cache')
+
+    else:
 
-    if 'No feed found' in feed_uri:
-        return 'No feed found'
+        # Find the feed's URI from the website name/URL
+        feed_uri = extraction_funcs.find_feed_uri(website)
+        logger.info('find_feed_uri() returned %s', feed_uri)
 
-    content = extraction_funcs.parse_feed(feed_uri)
-    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
+        if 'No feed found' in feed_uri:
+            logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+            return 'No feed found'
 
-    for i, item in content.items():
+        # Parse and extract content from the feed
+        content = extraction_funcs.parse_feed(feed_uri)
+        logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
-        if item['content'] is not None:
-            summary = summarization_funcs.summarize_content(item['content'])
-            content[i]['summary'] = summary
+        # Summarize each post in the feed
+        for i, item in content.items():
 
-        content[i].pop('content', None)
+            if item['content'] is not None:
+                summary = summarization_funcs.summarize_content(item['content'])
+                content[i]['summary'] = summary
 
+            content[i].pop('content', None)
+
+        LOCAL_CACHE['get_feed'][website] = content
+
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
 
     return json.dumps(content)
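
Because LOCAL_CACHE is a module-level dict, repeat calls for the same website within one server process skip discovery, parsing, and summarization entirely. A short usage sketch; 'hackernews' is an illustrative argument, not something from the commit:

import json

from functions.tools import get_feed

raw = get_feed('hackernews')                          # cold call: find URI, parse, summarize
raw_again = get_feed('hackernews')                    # warm call: served from LOCAL_CACHE
raw_fresh = get_feed('hackernews', use_cache=False)   # bypass the cache and re-download

# The failure path returns a bare string, not JSON, so check before decoding
if raw != 'No feed found':
    entries = json.loads(raw)

Note that the type hint still reads -> list while both paths actually return a string; the docstring's 'JSON string' description is the accurate one.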
rss_server.py CHANGED
@@ -1,6 +1,7 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 
@@ -17,7 +18,7 @@ Path('logs').mkdir(parents=True, exist_ok=True)
 # Clear old logs if present
 gradio_funcs.delete_old_logs('logs', 'rss_server')
 
-# Set up the root logger so we catch logs from
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',
@@ -29,9 +30,9 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
-
 with gr.Blocks() as demo:
 
     # Page text
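
This hunk adds from functools import partial, but the call site falls outside the visible context. A common reason to bring partial into a Gradio script like this one is to pre-bind an argument, for example a fixed use_cache value, before registering a tool function as an event handler. The wiring below is purely illustrative and not the repo's actual UI code:

from functools import partial

import gradio as gr

from functions.tools import get_feed

with gr.Blocks() as demo:
    website = gr.Textbox(label='Website name or URL')
    feed_json = gr.Textbox(label='Feed content')
    submit = gr.Button('Get feed')

    # Pre-bind use_cache so the click handler only needs the textbox value
    submit.click(fn=partial(get_feed, use_cache=False), inputs=website, outputs=feed_json)

demo.launch()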