gperdrizet committed · Commit e97f932 · unverified · 1 Parent(s): 3937afc

Added page content extraction and HTML cleaning functions.

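As a rough sketch of how the pieces added in this commit are meant to fit together (the call chain is inferred from the diff below; the site name is a made-up example, not part of the commit):

import json

import functions.helper_functions as helper_funcs

# Resolve a feed URI from a site name, URL or feed URI, then pull and clean its entries.
feed_uri = helper_funcs.find_feed_uri('hackernews.com')   # illustrative input only
entries = helper_funcs.parse_feed(feed_uri)                # dict keyed by entry index

# Each entry should carry title, link and the boilerpy3/regex-cleaned page text
# under 'extracted_content'.
print(json.dumps(entries.get(0, {}), indent=2))
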
functions/helper_functions.py CHANGED
@@ -1,12 +1,71 @@
'''Helper functions for MCP tools.'''

import logging
- from types import GeneratorType

import feedparser
from findfeed import search as feed_search
from googlesearch import search as google_search
 
def get_url(company_name: str) -> str:
    '''Finds the website associated with the name of a company or
    publication.
@@ -66,18 +125,146 @@ def parse_feed(feed_uri: str) -> list:
    logger = logging.getLogger(__name__ + '.parse_feed')

    feed = feedparser.parse(feed_uri)
-     logger.info('%s yieled %s entries', feed_uri, len(feed.entries))

-     titles = []

-     for entry in feed.entries:

-         logger.debug('Entry attributes: %s', list(entry.keys()))

-         if 'title' in entry:
-             titles.append(entry.title)

-         if len(titles) >= 10:
            break

-     return titles
 
'''Helper functions for MCP tools.'''

+ import re
import logging
+ import urllib.request
+ from urllib.error import HTTPError, URLError

import feedparser
+ from boilerpy3 import extractors
+ from boilerpy3.exceptions import HTMLExtractionError
from findfeed import search as feed_search
from googlesearch import search as google_search

+ FEED_URIS = {}
+ RSS_EXTENSIONS = ['xml', 'rss', 'atom']
+ COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
+
+
+ def find_feed_uri(website: str) -> str:
+     '''Attempts to find URI for RSS feed. First checks if the string provided in
+     website is a feed URI; if it's not, checks if website is a URL and, if so,
+     uses that to find the RSS feed URI. If the provided string is neither,
+     defaults to Google search to find the website URL and then uses that to try
+     and find the feed.
+
+     Args:
+         website: target resource to find RSS feed URI for, can be website URL or
+             name of website
+
+     Returns:
+         RSS feed URI for website
+     '''
+
+     logger = logging.getLogger(__name__ + '.find_feed_uri')
+     logger.info('Finding feed URI for %s', website)
+
+     # Find the feed URI
+     feed_uri = None
+
+     # If the website contains xml, rss or atom, assume it's an RSS URI
+     if any(extension in website.lower() for extension in RSS_EXTENSIONS):
+         feed_uri = website
+         logger.info('%s looks like a feed URI already - using it directly', website)
+
+     # Next, check the cache to see if we already have this feed's URI
+     elif website in FEED_URIS:
+         feed_uri = FEED_URIS[website]
+         logger.info('%s feed URI in cache: %s', website, feed_uri)
+
+     # If neither of those gets it - try get_feed() if it looks like a url
+     # or else just google it
+     else:
+         if website.split('.')[-1] in COMMON_EXTENSIONS:
+             website_url = website
+             logger.info('%s looks like a website URL', website)
+
+         else:
+             website_url = get_url(website)
+             logger.info('Google result for %s: %s', website, website_url)
+
+         feed_uri = get_feed(website_url)
+         logger.info('get_feed() returned %s', feed_uri)
+
+     FEED_URIS[website] = feed_uri
+
+     return feed_uri
+
+
def get_url(company_name: str) -> str:
    '''Finds the website associated with the name of a company or
    publication.
 
    logger = logging.getLogger(__name__ + '.parse_feed')

    feed = feedparser.parse(feed_uri)
+     logger.info('%s yielded %s entries', feed_uri, len(feed.entries))
+
+     entries = {}
+
+     for i, entry in enumerate(feed.entries):
+
+         entry_content = {}
+
+         if 'title' in entry and 'link' in entry:
+
+             entry_content['title'] = entry.title
+             entry_content['link'] = entry.link
+
+             entry_content['updated'] = None
+             entry_content['summary'] = None
+             entry_content['content'] = None
+
+             if 'updated' in entry:
+                 entry_content['updated'] = entry.updated
+
+             if 'summary' in entry:
+                 summary = get_text(entry.summary)
+                 entry_content['summary'] = summary

+             if 'content' in entry:
+                 entry_content['content'] = entry.content

+             html = get_html(entry_content['link'])
+             content = get_text(html)

+             entry_content['extracted_content'] = content

+             entries[i] = entry_content

+         if i == 9:
            break

+     logger.info('Entries contains %s elements', len(list(entries.keys())))
+
+     return entries
+
+
+ def get_html(url: str) -> str:
+     '''Gets HTML string content from url
+
+     Args:
+         url: the webpage to extract content from
+
+     Returns:
+         Webpage HTML source as string
+     '''
+
+     header = {
+         "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," +
+             "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
+         "Accept-Language": "en-US,en;q=0.9",
+         "Connection": "keep-alive",
+         "Sec-Fetch-Site": "cross-site",
+         "Sec-Fetch-User": "?1",
+         "Upgrade-Insecure-Requests": "1",
+         "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " +
+             "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
+     }
+
+     # Create the request with header
+     request_params = urllib.request.Request(
+         url=url,
+         headers=header
+     )
+
+     # Get the html string
+     try:
+         with urllib.request.urlopen(request_params) as response:
+             status_code = response.getcode()
+
+             if status_code == 200:
+                 content = response.read()
+                 encoding = response.headers.get_content_charset()
+
+                 if encoding is None:
+                     encoding = "utf-8"
+
+                 content = content.decode(encoding)
+
+     except HTTPError:
+         content = None
+
+     except URLError:
+         content = None
+
+     return content
+
+
+ def get_text(html: str) -> str:
+     '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
+     function to try and extract text from HTML as cleanly as possible.
+
+     Args:
+         html: the HTML string to be cleaned
+
+     Returns:
+         Cleaned text string'''
+
+     extractor = extractors.ArticleExtractor()
+
+     try:
+         html = extractor.get_content(html)
+
+     except HTMLExtractionError:
+         pass
+
+
+     return clean_html(html)
+
+
+ def clean_html(html: str) -> str:
+     '''
+     Remove HTML markup from the given string.
+
+     Args:
+         html: the HTML string to be cleaned
+
+     Returns:
+         Cleaned string
+     '''
+
+     # First we remove inline JavaScript/CSS:
+     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
+
+     # Then we remove html comments. This has to be done before removing regular
+     # tags since comments can contain '>' characters.
+     cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
+
+     # Next we can remove the remaining tags:
+     cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
+
+
+     # Finally, we deal with whitespace
+     cleaned = re.sub(r"&nbsp;", " ", cleaned)
+     cleaned = re.sub(r"  ", " ", cleaned)
+     cleaned = re.sub(r"  ", " ", cleaned)
+
+     return cleaned.strip()
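
For reference, a quick, hedged sketch of what the regex-based clean_html() pass above does to a small, made-up HTML snippet (output shown approximately):

from functions.helper_functions import clean_html

sample = (
    '<html><head><style>p {color: red}</style></head>'
    '<body><!-- nav --><p>First paragraph.</p>'
    '<script>var x = 1;</script><p>Second&nbsp;paragraph.</p></body></html>'
)

# Script/style blocks and comments are stripped first, remaining tags become
# spaces, then &nbsp; entities and runs of double spaces are collapsed.
print(clean_html(sample))   # roughly: 'First paragraph. Second paragraph.'
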
functions/tools.py CHANGED
@@ -1,13 +1,9 @@
'''Tool functions for MCP server'''

import logging
- from urllib.parse import urlparse
import functions.helper_functions as helper_funcs

- FEED_URIS = {}
- RSS_EXTENSIONS = ['xml', 'rss', 'atom']
- COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-

def get_content(website: str) -> list:
    '''Gets RSS feed content from a given website.
@@ -23,36 +19,13 @@ def get_content(website: str) -> list:
    logger = logging.getLogger(__name__ + '.get_content')
    logger.info('Getting feed content for: %s', website)

-     # Find the feed URI
-     feed_uri = None
-
-     # If the website contains xml, rss or atom, assume it's an RSS URI
-     if any(extension in website.lower() for extension in RSS_EXTENSIONS):
-         feed_uri = website
-         logger.info('%s looks like a feed URI already - using it directly', website)
-
-     # Next, check the cache to see if we alreay have this feed's URI
-     elif website in FEED_URIS.keys():
-         feed_uri = FEED_URIS[website]
-         logger.info('%s feed URI in cache: %s', website, feed_uri)
-
-     # If neither of those get it - try feedparse if it looks like a url
-     # or else just google it
-     else:
-         if website.split('.')[-1] in COMMON_EXTENSIONS:
-             website_url = website
-             logger.info('%s looks like a website URL', website)
-
-         else:
-             website_url = helper_funcs.get_url(website)
-             logger.info('Google result for %s: %s', website, website_url)
-
-         feed_uri = helper_funcs.get_feed(website_url)
-         logger.info('get_feed() returned %s', feed_uri)

-     FEED_URIS[website] = feed_uri

    content = helper_funcs.parse_feed(feed_uri)
-     logger.info('parse_feed() returned %s', content)

-     return '\n'.join(content)
 
'''Tool functions for MCP server'''

+ import json
import logging
import functions.helper_functions as helper_funcs


def get_content(website: str) -> list:
    '''Gets RSS feed content from a given website.

    logger = logging.getLogger(__name__ + '.get_content')
    logger.info('Getting feed content for: %s', website)

+     feed_uri = helper_funcs.find_feed_uri(website)
+     logger.info('find_feed_uri() returned %s', feed_uri)

+     if 'No feed found' in feed_uri:
+         return 'No feed found'

    content = helper_funcs.parse_feed(feed_uri)
+     logger.info('parse_feed() returned %s entries', len(list(content.keys())))

+     return json.dumps(content)
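
A short, hypothetical usage sketch for the updated get_content() tool (the site name is an arbitrary example; the JSON shape follows parse_feed() in helper_functions.py):

import json

from functions.tools import get_content

result = get_content('slashdot.org')   # any site name, URL or feed URI

if result != 'No feed found':
    for index, entry in json.loads(result).items():
        print(index, entry['title'], entry['link'])
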
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+ boilerpy3
feedparser
findfeed
googlesearch-python