Added caching of feed content and URI.
Files changed:

- functions/feed_extraction.py  +27 -5
- functions/tools.py  +36 -13
- rss_server.py  +3 -2
functions/feed_extraction.py

@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError

@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in

@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
 
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)

@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
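With this change, find_feed_uri() resolves a feed URI through two cache tiers before doing any discovery work: the in-process FEED_URIS dict, then an Upstash Redis store keyed on a normalized website name, with a write-back to Redis whenever discovery had to run. Below is a minimal sketch of the same read-through / write-back pattern pulled out of the diff for clarity. discover_feed_uri is a hypothetical stand-in for the feedparse/Google discovery step; the Redis URL and the UPSTASH_KEY environment variable are taken from the diff above and must be available for the sketch to run.

import os
from upstash_redis import Redis

LOCAL_URIS = {}  # per-process cache, plays the role of FEED_URIS

# Same Upstash REST connection as in the diff above
redis = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_KEY']
)

def cached_feed_uri(website: str, discover_feed_uri) -> str:
    '''Return a feed URI: local dict first, then Redis, then discovery.'''

    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"

    # 1. In-process cache: fastest, but lost on restart
    if website in LOCAL_URIS:
        return LOCAL_URIS[website]

    # 2. Shared Redis cache: survives restarts, shared across workers
    feed_uri = redis.get(cache_key)
    cache_hit = feed_uri is not None

    # 3. Fall back to discovery (feed search / Google in the real code)
    if feed_uri is None:
        feed_uri = discover_feed_uri(website)

    # Write back so later calls and other workers can reuse the result
    LOCAL_URIS[website] = feed_uri
    if not cache_hit:
        redis.set(cache_key, feed_uri)

    return feed_uri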
functions/tools.py

@@ -1,12 +1,16 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
 
+LOCAL_CACHE = {
+    'get_feed': {}
+}
 
-def get_feed(website: str) -> list:
+def get_feed(website: str, use_cache: bool = True) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent

@@ -14,30 +18,49 @@ def get_feed(website: str) -> list:
 
     Args:
         website: URL or name of website to extract RSS feed content from
+        use_cache: check local cache for content from RSS feed first before
+            downloading data from the website's RSS feed
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
         feed for the requested website could not be found
     '''
 
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
+    # Check to see if we have this feed cached, if desired
+    if use_cache is True and website in LOCAL_CACHE['get_feed']:
+        content = LOCAL_CACHE['get_feed'][website]
+        logger.info('Got feed content from local cache')
+
+    else:
+
+        # Find the feed's URI from the website name/URL
+        feed_uri = extraction_funcs.find_feed_uri(website)
+        logger.info('find_feed_uri() returned %s', feed_uri)
+
+        if 'No feed found' in feed_uri:
+            logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+            return 'No feed found'
+
+        # Parse and extract content from the feed
+        content = extraction_funcs.parse_feed(feed_uri)
+        logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+        # Summarize each post in the feed
+        for i, item in content.items():
 
-            summary = summarization_funcs.summarize_content(item['content'])
-            content[i]['summary'] = summary
+            if item['content'] is not None:
+                summary = summarization_funcs.summarize_content(item['content'])
+                content[i]['summary'] = summary
 
+            content[i].pop('content', None)
 
+        LOCAL_CACHE['get_feed'][website] = content
 
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
 
     return json.dumps(content)
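get_feed() now keeps a per-process content cache (LOCAL_CACHE['get_feed']) keyed by the website argument, and the new use_cache flag controls whether that cache is consulted before downloading, parsing and summarizing the feed; either way the summarized result is written back to the cache and the elapsed time is logged. A hypothetical usage sketch, assuming the repository's dependencies and summarization model are available and the script runs from the repo root:

from functions.tools import get_feed

feed_json = get_feed('hackernews.com')          # downloads, parses and summarizes the feed
feed_json_again = get_feed('hackernews.com')    # served from LOCAL_CACHE, much faster
feed_json_fresh = get_feed('hackernews.com', use_cache=False)  # bypasses the cache and refreshes it

# Each call returns a JSON string, or the plain string 'No feed found'
# when no RSS feed could be located for the argument.

Note that, unlike the feed URI cache above, this content cache lives only in process memory, so it starts empty again after a restart.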
rss_server.py

@@ -1,6 +1,7 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 

@@ -17,7 +18,7 @@ Path('logs').mkdir(parents=True, exist_ok=True)
 # Clear old logs if present
 gradio_funcs.delete_old_logs('logs', 'rss_server')
 
-# Set up the root logger so we catch logs from
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',

@@ -29,9 +30,9 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
 with gr.Blocks() as demo:
 
     # Page text
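The rss_server.py changes are cosmetic apart from the new functools.partial import, which is not used in the hunks shown here. One plausible use, sketched below as an assumption rather than anything confirmed by this commit, is pre-binding the new use_cache flag before the tool function is exposed through the interface:

from functools import partial

from functions.tools import get_feed

# Hypothetical: a callable with use_cache fixed, ready to hand to whatever
# registers the tool; callers then only pass the website argument.
get_feed_cached = partial(get_feed, use_cache=True)
print(get_feed_cached('hackernews.com'))  # JSON string or 'No feed found'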