Merge pull request #10 from gperdrizet/dev
- assets/html.py +10 -9
- functions/{helper_functions.py → feed_extraction.py} +67 -63
- functions/summarization.py +74 -0
- functions/tools.py +20 -8
- requirements.txt +2 -1
- rss_server.py +1 -1
assets/html.py
CHANGED
@@ -3,24 +3,25 @@
 TITLE = (
     '''
     <center>
-        <h1>RSS feed
+        <h1>RSS feed reader</h1>
     </center>
     '''
 )
 
 DESCRIPTION = (
     '''
-    Functions to find and
-
+    Functions to find, extract and summarize RSS feeds are complete.
+
+    <h2>Tools</h2>
 
     <ol>
-        <li>
-
+        <li><b>DONE</b> Given a website name or URL, find its RSS feed and return recent
+        article titles, links and a generated summary of content if available</li>
+        <li><b>TODO</b> Simple RAG on requested RSS feed content</li>
     </ol>
 
-    For now we
-
-
-    hackernews.com, Hugging Face, etc
+    For now we dump the extracted RSS title, link and summary below. Try asking for a
+    feed by website name, website URL, or entering your favorite feed URI directly.
+    Suggestions: http://openai.com/news/rss.xml, hackernews.com, Hugging Face, etc
     '''
 )
functions/{helper_functions.py → feed_extraction.py}
RENAMED
@@ -55,10 +55,10 @@ def find_feed_uri(website: str) -> str:
         logger.info('%s looks like a website URL', website)
 
     else:
-        website_url = get_url(website)
+        website_url = _get_url(website)
         logger.info('Google result for %s: %s', website, website_url)
 
-    feed_uri = get_feed(website_url)
+    feed_uri = _get_feed(website_url)
     logger.info('get_feed() returned %s', feed_uri)
 
     FEED_URIS[website] = feed_uri
@@ -66,52 +66,6 @@ def find_feed_uri(website: str) -> str:
     return feed_uri
 
 
-def get_url(company_name: str) -> str:
-    '''Finds the website associated with the name of a company or
-    publication.
-
-    Args:
-        company_name: the name of the company, publication or site to find
-        the URL for
-
-    Returns:
-        The URL for the company, publication or website.
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_url')
-    logger.info('Getting website URL for %s', company_name)
-
-    query = f'{company_name} official website'
-
-    for url in google_search(query, num_results=5):
-        if 'facebook' not in url and 'linkedin' not in url:
-            return url
-
-    return None
-
-
-def get_feed(website_url: str) -> str:
-    '''Finds the RSS feed URI for a website given the website's url.
-
-    Args:
-        website_url: The url for the website to find the RSS feed for
-
-    Returns:
-        The website's RSS feed URI as a string
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_content')
-    logger.info('Getting feed URI for: %s', website_url)
-
-    feeds = feed_search(website_url)
-
-    if len(feeds) > 0:
-        return str(feeds[0].url)
-
-    else:
-        return f'No feed found for {website_url}'
-
-
 def parse_feed(feed_uri: str) -> list:
     '''Gets content from a remote RSS feed URI.
 
@@ -138,28 +92,29 @@ def parse_feed(feed_uri: str) -> list:
         entry_content['title'] = entry.title
         entry_content['link'] = entry.link
 
-        entry_content['updated'] = None
-        entry_content['summary'] = None
+        # entry_content['updated'] = None
+        # entry_content['summary'] = None
         entry_content['content'] = None
 
-        if 'updated' in entry:
-            entry_content['updated'] = entry.updated
+        # if 'updated' in entry:
+        #     entry_content['updated'] = entry.updated
 
-        if 'summary' in entry:
-            summary = get_text(entry.summary)
-            entry_content['summary'] = summary
+        # if 'summary' in entry:
+        #     summary = _get_text(entry.summary)
+        #     entry_content['summary'] = summary
 
         if 'content' in entry:
             entry_content['content'] = entry.content
 
-        html = get_html(entry_content['link'])
-        content = get_text(html)
+        if entry_content['content'] is None:
 
-        entry_content['content'] = content
+            html = _get_html(entry_content['link'])
+            content = _get_text(html)
+            entry_content['content'] = content
 
         entries[i] = entry_content
 
-        if i ==
+        if i == 2:
             break
 
     logger.info('Entries contains %s elements', len(list(entries.keys())))
@@ -167,7 +122,53 @@ def parse_feed(feed_uri: str) -> list:
     return entries
 
 
-def get_html(url: str) -> str:
+def _get_url(company_name: str) -> str:
+    '''Finds the website associated with the name of a company or
+    publication.
+
+    Args:
+        company_name: the name of the company, publication or site to find
+        the URL for
+
+    Returns:
+        The URL for the company, publication or website.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_url')
+    logger.info('Getting website URL for %s', company_name)
+
+    query = f'{company_name} official website'
+
+    for url in google_search(query, num_results=5):
+        if 'facebook' not in url and 'linkedin' not in url:
+            return url
+
+    return None
+
+
+def _get_feed(website_url: str) -> str:
+    '''Finds the RSS feed URI for a website given the website's url.
+
+    Args:
+        website_url: The url for the website to find the RSS feed for
+
+    Returns:
+        The website's RSS feed URI as a string
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_content')
+    logger.info('Getting feed URI for: %s', website_url)
+
+    feeds = feed_search(website_url)
+
+    if len(feeds) > 0:
+        return str(feeds[0].url)
+
+    else:
+        return f'No feed found for {website_url}'
+
+
+def _get_html(url: str) -> str:
     '''Gets HTML string content from url
 
     Args:
@@ -221,7 +222,7 @@ def get_html(url: str) -> str:
     return content
 
 
-def get_text(html: str) -> str:
+def _get_text(html: str) -> str:
     '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
     function to try and extract text from HTML as cleanly as possible.
 
@@ -248,10 +249,10 @@ def get_text(html: str) -> str:
     except TypeError:
         pass
 
-    return clean_html(html)
+    return _clean_html(html)
 
 
-def clean_html(html: str) -> str:
+def _clean_html(html: str) -> str:
     '''
     Remove HTML markup from the given string.
 
@@ -262,6 +263,9 @@ def clean_html(html: str) -> str:
         Cleaned string
     '''
 
+    if html is None:
+        return None
+
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
 
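After the rename, find_feed_uri() and parse_feed() are the module's only public functions; everything prefixed with an underscore is an internal helper. A minimal sketch of how the two chain together (not part of the diff; assumes the repository root is on the import path and network access is available):

import json

import functions.feed_extraction as extraction_funcs

# Resolve a bare site name to a feed URI; full URLs and feed URIs pass
# through the same entry point
feed_uri = extraction_funcs.find_feed_uri('hackernews.com')

# Pull the most recent entries; the loop breaks at i == 2, so at most
# three entries come back
entries = extraction_funcs.parse_feed(feed_uri)
print(json.dumps(entries, indent=2, default=str))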
functions/summarization.py
ADDED
@@ -0,0 +1,74 @@
+'''Functions to summarize article content.'''
+
+import os
+import logging
+
+from openai import OpenAI
+
+
+def summarize_content(content: str) -> str:
+    '''Generates summary of article content using Modal inference endpoint.
+
+    Args:
+        content: string containing the text content to be summarized
+
+    Returns:
+        Summarized text as string
+    '''
+
+    logger = logging.getLogger(__name__ + '.summarize_content')
+    logger.info('Summarizing extracted content')
+
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+
+    # Default to first available model
+    model = client.models.list().data[0]
+    model_id = model.id
+
+    # messages = [
+    #     {
+    #         'role': 'system',
+    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
+    #         'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
+    #         'role': 'user',
+    #         'content': content
+    #     }
+    # ]
+
+    messages = [
+        {
+            'role': 'system',
+            'content': f'Summarize the following text in 50 words returning only the summary: {content}'
+        }
+    ]
+
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+        # "frequency_penalty": args.frequency_penalty,
+        # "max_tokens": 128,
+        # "n": args.n,
+        # "presence_penalty": args.presence_penalty,
+        # "seed": args.seed,
+        # "stop": args.stop,
+        # "stream": args.stream,
+        # "temperature": args.temperature,
+        # "top_p": args.top_p,
+    }
+
+    try:
+        response = client.chat.completions.create(**completion_args)
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+
+    if response is not None:
+        return response.choices[0].message.content
+
+    else:
+        return None
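A quick call sketch for the new module (not part of the diff). It assumes MODAL_API_KEY is set in the environment and the Modal endpoint above is reachable; on any API error summarize_content() logs the exception and returns None rather than raising:

import functions.summarization as summarization_funcs

# Placeholder input; in the app this is the text parse_feed() extracted
summary = summarization_funcs.summarize_content(
    'Full article text extracted from a feed entry goes here...'
)

if summary is None:
    print('Summarization endpoint unavailable, keeping raw content')
else:
    print(summary)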
functions/tools.py
CHANGED
@@ -2,30 +2,42 @@
 
 import json
 import logging
-import functions.
+import functions.feed_extraction as extraction_funcs
+import functions.summarization as summarization_funcs
 
 
-def
-    '''Gets RSS feed content from a given website.
+def get_feed(website: str) -> list:
+    '''Gets RSS feed content from a given website. Can take a website or RSS
+    feed URL directly, or the name of a website. Will attempt to find RSS
+    feed and return title, summary and link to full article for most recent
+    items in feed
 
     Args:
-
+        website: URL or name of website to extract RSS feed content from
 
     Returns:
-
-        requested website
+        JSON string containing the feed content or 'No feed found' if a RSS
+        feed for the requested website could not be found
     '''
 
     logger = logging.getLogger(__name__ + '.get_content')
     logger.info('Getting feed content for: %s', website)
 
-    feed_uri =
+    feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
         return 'No feed found'
 
-    content =
+    content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    for i, item in content.items():
+
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(item['content'])
+            content[i]['summary'] = summary
+
+        content[i].pop('content', None)
+
     return json.dumps(content)
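Sketch of consuming get_feed()'s return value (not part of the diff; the tool_funcs alias mirrors the one rss_server.py uses). The function returns the string 'No feed found' on failure, otherwise a JSON string keyed by entry index; after the summarization loop each entry carries title, link and summary, with the raw content field popped off:

import json

import functions.tools as tool_funcs

result = tool_funcs.get_feed('hackernews.com')

if result == 'No feed found':
    print(result)
else:
    # Entries are keyed by their index in the feed
    for index, entry in json.loads(result).items():
        print(index, entry['title'])
        print('  ', entry['link'])
        print('  ', entry.get('summary'))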
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ feedparser
 findfeed
 googlesearch-python
 gradio
-mcp
+mcp
+openai
rss_server.py
CHANGED
@@ -39,7 +39,7 @@ with gr.Blocks() as demo:
     submit_button = gr.Button('Submit')
 
     submit_button.click( # pylint: disable=no-member
-        fn=tool_funcs.
+        fn=tool_funcs.get_feed,
         inputs=website_url,
         outputs=output,
         api_name='Get RSS feed content'
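For context, a minimal reconstruction of the Gradio wiring around this change (a sketch, not the actual rss_server.py; only website_url, output, submit_button and the click() call are visible in the diff, the component definitions and launch are assumed):

import gradio as gr

import functions.tools as tool_funcs

with gr.Blocks() as demo:
    website_url = gr.Textbox(label='Website name, URL or feed URI')
    output = gr.Textbox(label='Extracted feed content')
    submit_button = gr.Button('Submit')

    # The click handler now points at tools.get_feed()
    submit_button.click( # pylint: disable=no-member
        fn=tool_funcs.get_feed,
        inputs=website_url,
        outputs=output,
        api_name='Get RSS feed content'
    )

if __name__ == '__main__':
    demo.launch()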