Added caching of feed content and URI.
Files changed:

- functions/feed_extraction.py  +27 -5
- functions/tools.py  +36 -13
- rss_server.py  +3 -2
functions/feed_extraction.py

@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError

@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in

@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
 
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)

@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
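With this change, find_feed_uri() resolves a feed URI through two cache tiers before doing any discovery work: the in-process FEED_URIS dict, then an Upstash Redis store keyed on a normalized website name, with a write-back to Redis whenever discovery had to run. Below is a minimal sketch of the same read-through / write-back pattern pulled out of the diff for clarity. discover_feed_uri is a hypothetical stand-in for the feedparse/Google discovery step; the Redis URL and the UPSTASH_KEY environment variable are taken from the diff above and must be available for the sketch to run.

import os
from upstash_redis import Redis

LOCAL_URIS = {}  # per-process cache, plays the role of FEED_URIS

# Same Upstash REST connection as in the diff above
redis = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_KEY']
)

def cached_feed_uri(website: str, discover_feed_uri) -> str:
    '''Return a feed URI: local dict first, then Redis, then discovery.'''

    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"

    # 1. In-process cache: fastest, but lost on restart
    if website in LOCAL_URIS:
        return LOCAL_URIS[website]

    # 2. Shared Redis cache: survives restarts, shared across workers
    feed_uri = redis.get(cache_key)
    cache_hit = feed_uri is not None

    # 3. Fall back to discovery (feed search / Google in the real code)
    if feed_uri is None:
        feed_uri = discover_feed_uri(website)

    # Write back so later calls and other workers can reuse the result
    LOCAL_URIS[website] = feed_uri
    if not cache_hit:
        redis.set(cache_key, feed_uri)

    return feed_uri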
functions/tools.py

@@ -1,12 +1,16 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
 import functions.summarization as summarization_funcs
 
+LOCAL_CACHE = {
+    'get_feed': {}
+}
 
-def get_feed(website: str) -> list:
+def get_feed(website: str, use_cache: bool = True) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent

@@ -14,30 +18,49 @@ def get_feed(website: str) -> list:
 
     Args:
         website: URL or name of website to extract RSS feed content from
+        use_cache: check local cache for content from RSS feed first before
+            downloading data from the website's RSS feed
 
     Returns:
         JSON string containing the feed content or 'No feed found' if a RSS
         feed for the requested website could not be found
     '''
 
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
+    # Check to see if we have this feed cached, if desired
+    if use_cache is True and website in LOCAL_CACHE['get_feed']:
+        content = LOCAL_CACHE['get_feed'][website]
+        logger.info('Got feed content from local cache')
+
+    else:
+
+        # Find the feed's URI from the website name/URL
+        feed_uri = extraction_funcs.find_feed_uri(website)
+        logger.info('find_feed_uri() returned %s', feed_uri)
+
+        if 'No feed found' in feed_uri:
+            logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+            return 'No feed found'
+
+        # Parse and extract content from the feed
+        content = extraction_funcs.parse_feed(feed_uri)
+        logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+        # Summarize each post in the feed
+        for i, item in content.items():
 
-            summary = summarization_funcs.summarize_content(item['content'])
-            content[i]['summary'] = summary
+            if item['content'] is not None:
+                summary = summarization_funcs.summarize_content(item['content'])
+                content[i]['summary'] = summary
 
+            content[i].pop('content', None)
 
+        LOCAL_CACHE['get_feed'][website] = content
 
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
 
     return json.dumps(content)
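get_feed() now keeps a per-process content cache (LOCAL_CACHE['get_feed']) keyed by the website argument, and the new use_cache flag controls whether that cache is consulted before downloading, parsing and summarizing the feed; either way the summarized result is written back to the cache and the elapsed time is logged. A hypothetical usage sketch, assuming the repository's dependencies and summarization model are available and the script runs from the repo root:

from functions.tools import get_feed

feed_json = get_feed('hackernews.com')          # downloads, parses and summarizes the feed
feed_json_again = get_feed('hackernews.com')    # served from LOCAL_CACHE, much faster
feed_json_fresh = get_feed('hackernews.com', use_cache=False)  # bypasses the cache and refreshes it

# Each call returns a JSON string, or the plain string 'No feed found'
# when no RSS feed could be located for the argument.

Note that, unlike the feed URI cache above, this content cache lives only in process memory, so it starts empty again after a restart.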
rss_server.py

@@ -1,6 +1,7 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 

@@ -17,7 +18,7 @@ Path('logs').mkdir(parents=True, exist_ok=True)
 # Clear old logs if present
 gradio_funcs.delete_old_logs('logs', 'rss_server')
 
-# Set up the root logger so we catch logs from
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',

@@ -29,9 +30,9 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
 with gr.Blocks() as demo:
 
     # Page text
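The rss_server.py changes are cosmetic apart from the new functools.partial import, which is not used in the hunks shown here. One plausible use, sketched below as an assumption rather than anything confirmed by this commit, is pre-binding the new use_cache flag before the tool function is exposed through the interface:

from functools import partial

from functions.tools import get_feed

# Hypothetical: a callable with use_cache fixed, ready to hand to whatever
# registers the tool; callers then only pass the website argument.
get_feed_cached = partial(get_feed, use_cache=True)
print(get_feed_cached('hackernews.com'))  # JSON string or 'No feed found'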