Improved caching
- functions/feed_extraction.py +22 -19
- functions/gradio_functions.py +2 -2
- functions/rag.py +1 -10
- functions/summarization.py +3 -1
- functions/tools.py +33 -15
- rss_server.py +3 -3
functions/feed_extraction.py
CHANGED
@@ -2,7 +2,6 @@

 import os
 import re
-import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -53,8 +52,8 @@ def find_feed_uri(website: str) -> str:
        feed_uri = FEED_URIS[website]
        logger.info('%s feed URI in local cache: %s', website, feed_uri)

-    # …
-    cache_key = f…
+    # If we still haven't found it, check to see if the URI is in the Redis cache
+    cache_key = f'{website} feed uri'
     cache_hit = False

     if feed_uri is None:
@@ -65,7 +64,7 @@ def find_feed_uri(website: str) -> str:
            feed_uri = cached_uri
            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)

-    # If none of those get it - try feedparse if it looks like a url
+    # If still none of those methods get it - try feedparse if it looks like a url
     # or else just google it
     if feed_uri is None:
        if website.split('.')[-1] in COMMON_EXTENSIONS:
@@ -79,6 +78,7 @@ def find_feed_uri(website: str) -> str:
            feed_uri = _get_feed(website_url)
            logger.info('get_feed() returned %s', feed_uri)

+        # Add to local cache
        FEED_URIS[website] = feed_uri

     # Add the feed URI to the redis cache if it wasn't already there
@@ -88,14 +88,16 @@ def find_feed_uri(website: str) -> str:
     return feed_uri


-def parse_feed(feed_uri: str) -> list:
+def parse_feed(feed_uri: str, n: int) -> list:
     '''Gets content from a remote RSS feed URI.

     Args:
        feed_uri: The RSS feed to get content from
+        n: the number of feed entries to parse

     Returns:
-        List of …
+        List of dictionaries for the n most recent entries in the RSS feed.
+        Each dictionary contains 'title', 'link' and 'content' keys.
     '''

     logger = logging.getLogger(__name__ + '.parse_feed')
@@ -112,16 +114,15 @@ def parse_feed(feed_uri: str) -> list:
        if 'title' in entry and 'link' in entry:

            title = entry.title
+            entry_content['title'] = title

-            # Check the Redis cache
-            cache_hit = False
-            cached_entry = REDIS.get(cache_key)
+            # Check the Redis cache
+            cached_link = REDIS.get(f'{title} link')

-            if …
-                cache_hit = True
-                entry_content = json.loads(cached_entry)
+            if cached_link:
                logger.info('Entry in Redis cache: "%s"', title)
+                entry_content['link'] = cached_link
+                entry_content['content'] = REDIS.get(f'{title} content')

            # If its not in the Redis cache, parse it from the feed data
            else:
@@ -129,24 +130,26 @@ def parse_feed(feed_uri: str) -> list:
                entry_content['link'] = entry.link
                entry_content['content'] = None

+                # Grab the article content from the feed, if provided
                if 'content' in entry:
                    entry_content['content'] = entry.content

-                …
+                # If not, try to get the article content from the link
+                elif entry_content['content'] is None:

                    html = _get_html(entry_content['link'])
                    content = _get_text(html)
                    entry_content['content'] = content

-            …
-            if cache_hit is False:
-                REDIS.set(cache_key, entry_content)
+                # Add everything to the cache
+                REDIS.set(f'{title} link', entry_content['link'])
+                REDIS.set(f'{title} content', entry_content['content'])
+
+                logger.info('Parsed entry: "%s"', title)

            entries[i] = entry_content

-            if i == …
+            if i == n-1:
                break

     logger.info('Entries contains %s elements', len(list(entries.keys())))
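The commit replaces the old single-key JSON entry cache (hence dropping `import json`) with one Redis string per field, keyed on the entry title. A minimal sketch of that read-through pattern, assuming a redis-py client with `decode_responses=True`; the helper name and fetcher are hypothetical:

import redis

# Assumed client setup; the repo configures its REDIS handle elsewhere
REDIS = redis.Redis(host='localhost', port=6379, decode_responses=True)

def get_entry_cached(title: str, link: str, fetch_content) -> dict:
    '''Hypothetical helper: return a cached entry if present, otherwise
    build it and write each field under its own key.'''

    cached_link = REDIS.get(f'{title} link')

    if cached_link:
        # Cache hit: link and content were written together, so read both
        return {'title': title, 'link': cached_link,
                'content': REDIS.get(f'{title} content')}

    # Cache miss: extract the content, then cache each field separately
    content = fetch_content(link)
    REDIS.set(f'{title} link', link)
    REDIS.set(f'{title} content', content)

    return {'title': title, 'link': link, 'content': content}

One caveat: redis-py's set() raises DataError on None, so this pattern relies on the content extraction always producing a string before the entry is cached.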
functions/gradio_functions.py
CHANGED
@@ -6,7 +6,7 @@ import logging

 from openai import OpenAI

-async def call_modal() -> None:
+def call_modal() -> None:
     '''Sends request to Modal to spin up container'''

     logger = logging.getLogger(__name__ + '.call_modal()')
@@ -30,7 +30,7 @@ async def call_modal() -> None:
        }
     ]

-    logger.info('Prompt: %s'…
+    logger.info('Prompt: %s', messages[0]['content'])

     completion_args = {
        'model': model_id,
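With async dropped, the warm-up can run as a plain blocking call during startup (see rss_server.py below). A sketch of the shape, assuming an OpenAI-compatible endpoint served from Modal; the URL and model id are placeholders:

import logging
from openai import OpenAI

def call_modal() -> None:
    '''Sends one small request so the Modal container cold-starts.'''

    logger = logging.getLogger(__name__ + '.call_modal()')

    # Placeholder endpoint and model id; the real values live in the repo
    client = OpenAI(base_url='https://example--vllm.modal.run/v1', api_key='unused')
    messages = [{'role': 'user', 'content': 'Hello'}]

    logger.info('Prompt: %s', messages[0]['content'])

    try:
        client.chat.completions.create(model='placeholder-model', messages=messages)
    except Exception:
        # Only the cold start matters here, so a failed response is non-fatal
        logger.info('Warm-up request failed; container may still be starting')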
functions/rag.py
CHANGED
@@ -24,7 +24,7 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
     namespaces = index.list_namespaces()

     item = rag_ingest_queue.get()
-    logger.info(item)
+    logger.info('Upserting "%s": %s', item['title'], item)
     title = item['title']

     if title not in namespaces:
@@ -36,15 +36,6 @@ def ingest(rag_ingest_queue: queue.Queue) -> None:
        chunks=splitter.chunks(text)

        for i, chunk in enumerate(chunks):
-            # index.upsert(
-            #     vectors=[
-            #         Vector(
-            #             id=hash(f'{title}-{i}'),
-            #             data=chunk,
-            #         )
-            #     ],
-            #     namespace=title
-            # )

            index.upsert(
                [
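For context, ingest() is the body of a queue-worker thread: tools.py puts parsed articles on a queue and this loop chunks and upserts them. A self-contained sketch of that shape, with the splitter and the Upstash upsert reduced to stand-ins:

import queue
import threading

def ingest(rag_ingest_queue: queue.Queue) -> None:
    '''Worker loop: take an item off the queue, chunk it, upsert the chunks.'''

    while True:
        item = rag_ingest_queue.get()  # blocks until an item arrives
        title, text = item['title'], item['content']

        # Stand-in for the semantic splitter used in the repo
        chunks = [text[i:i + 500] for i in range(0, len(text), 500)]

        for i, chunk in enumerate(chunks):
            # Stand-in for index.upsert(); each chunk gets a stable id
            print(f'upsert {title}-{i}: {len(chunk)} chars')

        rag_ingest_queue.task_done()

# The worker runs as a daemon thread feeding off a module-level queue
RAG_INGEST_QUEUE = queue.Queue()
threading.Thread(target=ingest, args=(RAG_INGEST_QUEUE,), daemon=True).start()
RAG_INGEST_QUEUE.put({'title': 'Example', 'content': 'Some article text.'})
RAG_INGEST_QUEUE.join()  # wait for the worker to finish this item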
functions/summarization.py
CHANGED
@@ -25,7 +25,7 @@ def summarize_content(title: str, content: str) -> str:
     logger.info('Summarizing extracted content')

     # Check Redis cache for summary
-    cache_key = f…
+    cache_key = f'{title} summary'
     cached_summary = REDIS.get(cache_key)

     if cached_summary:
@@ -77,6 +77,8 @@ def summarize_content(title: str, content: str) -> str:
     else:
        summary = None

+    # Add the new summary to the cache
     REDIS.set(cache_key, summary)
     logger.info('Summarized: "%s"', title)
+
     return summary
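The summary cache is the same cache-aside pattern keyed on the title. Worth noting: the diff calls REDIS.set(cache_key, summary) even when summary is None, which redis-py rejects with a DataError; a minimal sketch with that case guarded, assuming a hypothetical generate_summary() in place of the repo's LLM call:

import redis

REDIS = redis.Redis(decode_responses=True)  # assumed client setup

def generate_summary(content: str) -> str:
    '''Hypothetical stand-in for the repo's LLM summarization call.'''
    return content[:200]

def summarize_content(title: str, content: str) -> str:
    '''Cache-aside summarization keyed on the article title.'''

    cache_key = f'{title} summary'
    cached_summary = REDIS.get(cache_key)

    if cached_summary:
        return cached_summary

    summary = generate_summary(content)

    if summary is not None:  # redis-py set() raises DataError on None
        REDIS.set(cache_key, summary)

    return summary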
functions/tools.py
CHANGED
@@ -6,7 +6,7 @@ import time
 import json
 import logging
 import queue
-from upstash_vector import Index
+from upstash_vector import Index

 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
@@ -22,14 +22,17 @@ rag_ingest_thread = threading.Thread(
 rag_ingest_thread.start()


-def get_feed(website: str) -> list:
+def get_feed(website: str, n: int = 3) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed.
+    n items in feed. This function is slow and resource heavy, only call it
+    when the user wants to check a feed for new content, or asks for content
+    from a feed that you have not retrieved yet.

     Args:
        website: URL or name of website to extract RSS feed content from
+        n: (optional) number of articles to parse from feed, defaults to 3

     Returns:
        JSON string containing the feed content or 'No feed found' if a RSS
@@ -50,44 +53,57 @@ def get_feed(website: str) -> list:
        return 'No feed found'

     # Parse and extract content from the feed
-    …
-    logger.info('parse_feed() returned %s entries', len(list(…
+    articles = extraction_funcs.parse_feed(feed_uri, n)
+    logger.info('parse_feed() returned %s entries', len(list(articles.keys())))

-    # …
-    for i, item in …
+    # Loop on the posts, sending them to RAG (nonblocking) and summarization (blocking)
+    for i, item in articles.items():

+        # Check if content is present
        if item['content'] is not None:
+            logger.info('Summarizing/RAG ingesting: %s', item)

-            …
+            # Send to RAG ingest
+            RAG_INGEST_QUEUE.put(item.copy())
            logger.info('"%s" sent to RAG ingest', item['title'])

+            # Generate summary and add to content
            summary = summarization_funcs.summarize_content(
                item['title'],
                item['content']
            )

-            …
+            articles[i]['summary'] = summary
            logger.info('Summary of "%s" generated', item['title'])

-        content…
+        # Remove full-text content before returning
+        articles[i].pop('content', None)

     logger.info('Completed in %s seconds', round(time.time()-start_time, 2))

-    …
+    # Return content dictionary as string
+    return json.dumps(articles)


 def context_search(query: str, article_title: str = None) -> str:
-    '''Searches for context relevant to query in article vector store.
+    '''Searches for context relevant to query in article vector store.
+    Use this function to search for additional information before
+    answering the user's question about an article. If article_title is
+    provided the search will only return results from that article. If
+    article_title is omitted, the search will include all articles
+    currently in the cache.

     Args:
        query: user query to find context for
-        article_title: optional, use this argument to search only for
-        …
+        article_title: optional, use this argument to search only for
+        context from a specific article, defaults to None

     Returns:
-        Context which best matches query
+        Context information which best matches the query.
     '''

+    logger = logging.getLogger(__name__ + '.context_search')
+
     index = Index(
        url='https://living-whale-89944-us1-vector.upstash.io',
        token=os.environ['UPSTASH_VECTOR_KEY']
@@ -103,4 +119,6 @@ def context_search(query: str, article_title: str = None) -> str:
        namespace=article_title
     )

+    logger.info('Retrieved %s chunks for "%s"', len(results), query)
+
     return results
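As a usage sketch, the new n parameter caps how many entries each call parses and summarizes, and the tool returns a JSON string keyed by entry index; the website name here is illustrative:

import json
import functions.tools as tool_funcs  # the module shown above

raw = tool_funcs.get_feed('hackernews.com', n=2)  # parse at most two entries
articles = json.loads(raw)

for i, article in articles.items():
    # json.loads leaves the integer indices as string keys;
    # 'content' was popped before return, leaving title, link and summary
    print(i, article['title'], article['link'], article.get('summary'))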
rss_server.py
CHANGED
@@ -1,6 +1,5 @@
 '''Main script to run gradio interface and MCP server.'''

-import asyncio
 import logging
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
@@ -10,8 +9,9 @@ import assets.html as html
 import functions.tools as tool_funcs
 import functions.gradio_functions as gradio_funcs

-# Call the modal container so it spins up
-…
+# Call the modal container so it spins up before the rest of
+# the app starts
+gradio_funcs.call_modal()

 # Set-up logging
 # Make sure log directory exists
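Since call_modal() now blocks until the request returns, startup waits on the warm-up to complete. If that blocking ever becomes a problem, the same call could be made fire-and-forget instead; a one-line alternative (not what this commit does):

import threading
import functions.gradio_functions as gradio_funcs

# Warm up the Modal container without blocking the rest of startup
threading.Thread(target=gradio_funcs.call_modal, daemon=True).start()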