Merge pull request #14 from gperdrizet/dev
- assets/html.py +6 -5
- functions/feed_extraction.py +53 -21
- functions/gradio_functions.py +37 -0
- functions/summarization.py +21 -13
- functions/tools.py +15 -3
- rss_server.py +22 -7
assets/html.py
CHANGED

```diff
@@ -11,13 +11,14 @@ TITLE = (
 DESCRIPTION = (
     '''
     <p>RSS feed reader MCP server. See
-    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
-    for a demonstration. Check out the
-    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
-    Both Spaces by
+    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
+    Agentic RSS reader</a> for a demonstration. Check out the
+    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
+    main project repo on GitHub</a>. Both Spaces by
+    <a href="https://www.linkedin.com/in/gperdrizet">George Perdrizet</a>.</p>
 
     <p>This Space is not meant to be used directly, but you can try out the bare tool below.
-    Enter a website name, website URL, or
+    Enter a website name, website URL, or feed URI. The tool will do it's best
     to find the feed and return titles, links and summaries for the three most recent posts.
     Suggestions: http://openai.com/news/rss.xml, hackernews.com, slashdot, etc.</p>
 
```
functions/feed_extraction.py
CHANGED

```diff
@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in
@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
 
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)
@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
 
 
@@ -89,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
 
         if 'title' in entry and 'link' in entry:
 
+            title = entry.title
+
+            # Check the Redis cache for this entry
+            cache_key = title.lower().replace(' ', '_')
+            cache_hit = False
+            cached_entry = REDIS.get(cache_key)
 
+            if cached_entry:
+                cache_hit = True
+                entry_content = json.loads(cached_entry)
+                logger.info('Entry in Redis cache: "%s"', title)
+
-            # summary = _get_text(entry.summary)
-            # entry_content['summary'] = summary
+            # If its not in the Redis cache, parse it from the feed data
+            else:
+                entry_content['title'] = entry.title
+                entry_content['link'] = entry.link
+                entry_content['content'] = None
 
+                if 'content' in entry:
+                    entry_content['content'] = entry.content
 
+                if entry_content['content'] is None:
 
+                    html = _get_html(entry_content['link'])
+                    content = _get_text(html)
+                    entry_content['content'] = content
 
+                logger.info('Parsed entry: "%s"', title)
 
+            # Add it to the Redis cache if it wasn't there
+            if cache_hit is False:
+                REDIS.set(cache_key, entry_content)
 
             entries[i] = entry_content
 
```
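Both `find_feed_uri()` and `parse_feed()` now follow the same read-through pattern against Upstash Redis: check the cache, fall back to the expensive lookup on a miss, then backfill the key. Here is that pattern in isolation: a minimal sketch using the same client, endpoint, and key convention as the diff, with a hypothetical `discover()` standing in for the findfeed/googlesearch fallback:

```python
'''Minimal sketch of the read-through cache pattern added above.'''

import os

from upstash_redis import Redis

# Same endpoint and env var as the diff; Upstash stores values as strings
REDIS = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_REDIS_KEY']
)


def discover(website: str) -> str:
    '''Hypothetical stand-in for the findfeed/googlesearch fallback.'''
    return f'https://{website}/feed.rss'


def cached_feed_uri(website: str) -> str:
    '''Returns the feed URI for a website, consulting Redis first.'''

    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
    feed_uri = REDIS.get(cache_key)     # None on a cache miss

    if feed_uri is None:
        feed_uri = discover(website)    # slow path
        REDIS.set(cache_key, feed_uri)  # backfill so the next call is a hit

    return feed_uri
```

Since Upstash stores strings, cached feed entries presumably round-trip through `json.dumps()`/`json.loads()`, which is what the `json.loads(cached_entry)` on the read side suggests.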
functions/gradio_functions.py
ADDED

```diff
@@ -0,0 +1,37 @@
+'''Collection of helper functions for Gradio UI and interface.'''
+
+import os
+import re
+
+
+def update_log(n: int = 10):
+    '''Gets updated logging output from disk to display to user.
+
+    Args:
+        n: number of most recent lines of log output to display
+
+    Returns:
+        Logging output as string
+    '''
+
+    with open('logs/rss_server.log', 'r', encoding='utf-8') as log_file:
+        lines = log_file.readlines()
+
+    return ''.join(lines[-n:])
+
+
+def delete_old_logs(directory:str, basename:str) -> None:
+    '''Deletes old log files from previous optimization sessions, if present.
+
+    Args:
+        directory: path to log file directory as string
+        basename: log file base name as string
+
+    Returns:
+        None
+    '''
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if re.search(basename, filename):
+            os.remove(file_path)
+
```
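Both helpers assume the `logs/` layout that `rss_server.py` sets up. A quick exercise of the new module, run from the repository root (the log line written here is just illustrative):

```python
from pathlib import Path

import functions.gradio_functions as gradio_funcs

# Mirror the server's log directory setup
Path('logs').mkdir(parents=True, exist_ok=True)

# delete_old_logs() matches with re.search(), so 'rss_server' also catches
# rotated files like rss_server.log.1 left over from a previous session
gradio_funcs.delete_old_logs('logs', 'rss_server')

# Write a line so update_log() has something to tail
with open('logs/rss_server.log', 'w', encoding='utf-8') as log_file:
    log_file.write('INFO - root - server started\n')

# Returns the last n lines (default 10) as one string for the textbox
print(gradio_funcs.update_log())
```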
functions/summarization.py
CHANGED

```diff
@@ -4,9 +4,14 @@ import os
 import logging
 
 from openai import OpenAI
+from upstash_redis import Redis
 
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
-def summarize_content(content: str) -> str:
+def summarize_content(title: str, content: str) -> str:
     '''Generates summary of article content using Modal inference endpoint.
 
     Args:
@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
     logger = logging.getLogger(__name__ + '.summarize_content')
     logger.info('Summarizing extracted content')
 
+    # Check Redis cache for summary
+    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cached_summary = REDIS.get(cache_key)
+
+    if cached_summary:
+        logger.info('Got summary from Redis cache: "%s"', title)
+        return cached_summary
+
+    # It the summary is not in the cache, generate it
     client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
 
     client.base_url = (
@@ -29,16 +43,6 @@
     model = client.models.list().data[0]
     model_id = model.id
 
-    # messages = [
-    #     {
-    #         'role': 'system',
-    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
-    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
-    #         'role': 'user',
-    #         'content': content
-    #     }
-    # ]
-
     messages = [
         {
             'role': 'system',
@@ -68,7 +72,11 @@
         logger.error('Error during Modal API call: %s', e)
 
     if response is not None:
+        summary = response.choices[0].message.content
 
     else:
+        summary = None
+
+    REDIS.set(cache_key, summary)
+    logger.info('Summarized: "%s"', title)
+    return summary
```
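A cache hit now skips the Modal call entirely, and each newly generated summary is written back under a `<title>-summary` key. Condensed, the flow looks like the sketch below; `MODAL_BASE_URL` is a hypothetical stand-in for the endpoint URL, which the diff assigns to `client.base_url` but does not show in full:

```python
'''Sketch of the cache-first summarization flow, assuming an
OpenAI-compatible inference endpoint.'''

import os

from openai import OpenAI
from upstash_redis import Redis

REDIS = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_REDIS_KEY']
)


def summarize(title: str, content: str) -> str:
    '''Returns a cached summary if present, otherwise generates one.'''

    cache_key = f"{title.lower().replace(' ', '_')}-summary"
    cached_summary = REDIS.get(cache_key)

    if cached_summary:
        return cached_summary  # hit: no inference call needed

    client = OpenAI(
        api_key=os.environ['MODAL_API_KEY'],
        base_url=os.environ['MODAL_BASE_URL']  # hypothetical env var
    )

    # The endpoint serves a single model; take the first one listed
    model_id = client.models.list().data[0].id

    response = client.chat.completions.create(
        model=model_id,
        messages=[
            {'role': 'system', 'content': 'Summarize documents in 2-4 sentences.'},
            {'role': 'user', 'content': content}
        ]
    )

    summary = response.choices[0].message.content

    if summary:  # only cache real summaries, not None
        REDIS.set(cache_key, summary)

    return summary
```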
functions/tools.py
CHANGED

```diff
@@ -1,5 +1,6 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
@@ -10,7 +11,7 @@ def get_feed(website: str) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed
+    items in feed.
 
     Args:
         website: URL or name of website to extract RSS feed content from
@@ -20,24 +21,35 @@ def get_feed(website: str) -> list:
         feed for the requested website could not be found
     '''
 
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
+    # Find the feed's URI from the website name/URL
     feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
        return 'No feed found'
 
+    # Parse and extract content from the feed
    content = extraction_funcs.parse_feed(feed_uri)
    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    # Summarize each post in the feed
    for i, item in content.items():
 
        if item['content'] is not None:
-            summary = summarization_funcs.summarize_content(
+            summary = summarization_funcs.summarize_content(
+                item['title'],
+                item['content']
+            )
            content[i]['summary'] = summary
 
            content[i].pop('content', None)
 
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+
    return json.dumps(content)
```
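The rest of the change is light instrumentation: `get_feed()` stamps `time.time()` on entry and logs elapsed seconds on both exit paths. The control-flow skeleton, with the actual feed work replaced by placeholders:

```python
import time
import json
import logging

logging.basicConfig(format='%(levelname)s - %(name)s - %(message)s', level=logging.INFO)


def get_feed_skeleton(website: str) -> str:
    '''Control-flow skeleton of get_feed() showing the new timing logs.'''

    start_time = time.time()

    logger = logging.getLogger(__name__ + '.get_feed()')
    logger.info('Getting feed content for: %s', website)

    # Placeholder for find_feed_uri()
    feed_uri = f'https://{website}/rss' if website else 'No feed found'

    if 'No feed found' in feed_uri:
        # The early-return path gets its own timing line
        logger.info('Completed in %s seconds', round(time.time() - start_time, 2))
        return 'No feed found'

    # Placeholder for parse_feed() plus summarization
    content = {0: {'title': 'Example post', 'link': feed_uri, 'summary': None}}

    logger.info('Completed in %s seconds', round(time.time() - start_time, 2))
    return json.dumps(content)
```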
rss_server.py
CHANGED

```diff
@@ -1,19 +1,24 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 
 import gradio as gr
 import assets.html as html
+import functions.tools as tool_funcs
+import functions.gradio_functions as gradio_funcs
 
+
+# Set-up logging
 # Make sure log directory exists
 Path('logs').mkdir(parents=True, exist_ok=True)
 
+# Clear old logs if present
+gradio_funcs.delete_old_logs('logs', 'rss_server')
 
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',
@@ -25,15 +30,25 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
 with gr.Blocks() as demo:
 
+    # Page text
+    gr.HTML(html.TITLE)
+    gr.HTML(html.DESCRIPTION)
+
+    # Log output
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=100)
+    timer = gr.Timer(0.5, active=True)
+
+    timer.tick( # pylint: disable=no-member
+        lambda: gradio_funcs.update_log(), # pylint: disable=unnecessary-lambda
+        outputs=dialog_output
+    )
 
+    # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
     output = gr.Textbox(label='RSS entries', lines=10)
     submit_button = gr.Button('Submit')
```