Merge pull request #14 from gperdrizet/dev
- assets/html.py +6 -5
- functions/feed_extraction.py +53 -21
- functions/gradio_functions.py +37 -0
- functions/summarization.py +21 -13
- functions/tools.py +15 -3
- rss_server.py +22 -7
assets/html.py
CHANGED

```diff
@@ -11,13 +11,14 @@ TITLE = (
 DESCRIPTION = (
     '''
     <p>RSS feed reader MCP server. See
-    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
-    for a demonstration. Check out the
-    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
-    Both Spaces by
+    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
+    Agentic RSS reader</a> for a demonstration. Check out the
+    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
+    main project repo on GitHub</a>. Both Spaces by
+    <a href="https://www.linkedin.com/in/gperdrizet">George Perdrizet</a>.</p>
 
     <p>This Space is not meant to be used directly, but you can try out the bare tool below.
-    Enter a website name, website URL, or
+    Enter a website name, website URL, or feed URI. The tool will do it's best
     to find the feed and return titles, links and summaries for the three most recent posts.
     Suggestions: http://openai.com/news/rss.xml, hackernews.com, slashdot, etc.</p>
 
```
functions/feed_extraction.py
CHANGED

```diff
@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in
@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
 
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
+
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)
@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
 
 
@@ -89,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
 
         if 'title' in entry and 'link' in entry:
 
+            title = entry.title
+
+            # Check the Redis cache for this entry
+            cache_key = title.lower().replace(' ', '_')
+            cache_hit = False
+            cached_entry = REDIS.get(cache_key)
 
+            if cached_entry:
+                cache_hit = True
+                entry_content = json.loads(cached_entry)
+                logger.info('Entry in Redis cache: "%s"', title)
+
-            # summary = _get_text(entry.summary)
-            # entry_content['summary'] = summary
+            # If its not in the Redis cache, parse it from the feed data
+            else:
+                entry_content['title'] = entry.title
+                entry_content['link'] = entry.link
+                entry_content['content'] = None
 
+                if 'content' in entry:
+                    entry_content['content'] = entry.content
 
+                if entry_content['content'] is None:
 
+                    html = _get_html(entry_content['link'])
+                    content = _get_text(html)
+                    entry_content['content'] = content
 
+                logger.info('Parsed entry: "%s"', title)
 
+            # Add it to the Redis cache if it wasn't there
+            if cache_hit is False:
+                REDIS.set(cache_key, entry_content)
 
             entries[i] = entry_content
 
```
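Both `find_feed_uri()` and `parse_feed()` now follow the same read-through pattern against Upstash Redis: check the cache, fall back to the expensive lookup on a miss, then backfill the key. Here is that pattern in isolation: a minimal sketch using the same client, endpoint, and key convention as the diff, with a hypothetical `discover()` standing in for the findfeed/googlesearch fallback:

```python
'''Minimal sketch of the read-through cache pattern added above.'''

import os

from upstash_redis import Redis

# Same endpoint and env var as the diff; Upstash stores values as strings
REDIS = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_REDIS_KEY']
)


def discover(website: str) -> str:
    '''Hypothetical stand-in for the findfeed/googlesearch fallback.'''
    return f'https://{website}/feed.rss'


def cached_feed_uri(website: str) -> str:
    '''Returns the feed URI for a website, consulting Redis first.'''

    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
    feed_uri = REDIS.get(cache_key)     # None on a cache miss

    if feed_uri is None:
        feed_uri = discover(website)    # slow path
        REDIS.set(cache_key, feed_uri)  # backfill so the next call is a hit

    return feed_uri
```

Since Upstash stores strings, cached feed entries presumably round-trip through `json.dumps()`/`json.loads()`, which is what the `json.loads(cached_entry)` on the read side suggests.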
functions/gradio_functions.py
ADDED

```diff
@@ -0,0 +1,37 @@
+'''Collection of helper functions for Gradio UI and interface.'''
+
+import os
+import re
+
+
+def update_log(n: int = 10):
+    '''Gets updated logging output from disk to display to user.
+
+    Args:
+        n: number of most recent lines of log output to display
+
+    Returns:
+        Logging output as string
+    '''
+
+    with open('logs/rss_server.log', 'r', encoding='utf-8') as log_file:
+        lines = log_file.readlines()
+
+    return ''.join(lines[-n:])
+
+
+def delete_old_logs(directory:str, basename:str) -> None:
+    '''Deletes old log files from previous optimization sessions, if present.
+
+    Args:
+        directory: path to log file directory as string
+        basename: log file base name as string
+
+    Returns:
+        None
+    '''
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if re.search(basename, filename):
+            os.remove(file_path)
+
```
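Both helpers assume the `logs/` layout that `rss_server.py` sets up. A quick exercise of the new module, run from the repository root (the log line written here is just illustrative):

```python
from pathlib import Path

import functions.gradio_functions as gradio_funcs

# Mirror the server's log directory setup
Path('logs').mkdir(parents=True, exist_ok=True)

# delete_old_logs() matches with re.search(), so 'rss_server' also catches
# rotated files like rss_server.log.1 left over from a previous session
gradio_funcs.delete_old_logs('logs', 'rss_server')

# Write a line so update_log() has something to tail
with open('logs/rss_server.log', 'w', encoding='utf-8') as log_file:
    log_file.write('INFO - root - server started\n')

# Returns the last n lines (default 10) as one string for the textbox
print(gradio_funcs.update_log())
```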
functions/summarization.py
CHANGED

```diff
@@ -4,9 +4,14 @@ import os
 import logging
 
 from openai import OpenAI
+from upstash_redis import Redis
 
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
-def summarize_content(content: str) -> str:
+def summarize_content(title: str, content: str) -> str:
     '''Generates summary of article content using Modal inference endpoint.
 
     Args:
@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
     logger = logging.getLogger(__name__ + '.summarize_content')
     logger.info('Summarizing extracted content')
 
+    # Check Redis cache for summary
+    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cached_summary = REDIS.get(cache_key)
+
+    if cached_summary:
+        logger.info('Got summary from Redis cache: "%s"', title)
+        return cached_summary
+
+    # It the summary is not in the cache, generate it
     client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
 
     client.base_url = (
@@ -29,16 +43,6 @@
     model = client.models.list().data[0]
     model_id = model.id
 
-    # messages = [
-    #     {
-    #         'role': 'system',
-    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
-    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
-    #         'role': 'user',
-    #         'content': content
-    #     }
-    # ]
-
     messages = [
         {
             'role': 'system',
@@ -68,7 +72,11 @@
         logger.error('Error during Modal API call: %s', e)
 
     if response is not None:
+        summary = response.choices[0].message.content
 
     else:
+        summary = None
+
+    REDIS.set(cache_key, summary)
+    logger.info('Summarized: "%s"', title)
+    return summary
```
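A cache hit now skips the Modal call entirely, and each newly generated summary is written back under a `<title>-summary` key. Condensed, the flow looks like the sketch below; `MODAL_BASE_URL` is a hypothetical stand-in for the endpoint URL, which the diff assigns to `client.base_url` but does not show in full:

```python
'''Sketch of the cache-first summarization flow, assuming an
OpenAI-compatible inference endpoint.'''

import os

from openai import OpenAI
from upstash_redis import Redis

REDIS = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_REDIS_KEY']
)


def summarize(title: str, content: str) -> str:
    '''Returns a cached summary if present, otherwise generates one.'''

    cache_key = f"{title.lower().replace(' ', '_')}-summary"
    cached_summary = REDIS.get(cache_key)

    if cached_summary:
        return cached_summary  # hit: no inference call needed

    client = OpenAI(
        api_key=os.environ['MODAL_API_KEY'],
        base_url=os.environ['MODAL_BASE_URL']  # hypothetical env var
    )

    # The endpoint serves a single model; take the first one listed
    model_id = client.models.list().data[0].id

    response = client.chat.completions.create(
        model=model_id,
        messages=[
            {'role': 'system', 'content': 'Summarize documents in 2-4 sentences.'},
            {'role': 'user', 'content': content}
        ]
    )

    summary = response.choices[0].message.content

    if summary:  # only cache real summaries, not None
        REDIS.set(cache_key, summary)

    return summary
```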
functions/tools.py
CHANGED

```diff
@@ -1,5 +1,6 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
@@ -10,7 +11,7 @@ def get_feed(website: str) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed
+    items in feed.
 
     Args:
         website: URL or name of website to extract RSS feed content from
@@ -20,24 +21,35 @@ def get_feed(website: str) -> list:
         feed for the requested website could not be found
     '''
 
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
+    # Find the feed's URI from the website name/URL
     feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
        return 'No feed found'
 
+    # Parse and extract content from the feed
    content = extraction_funcs.parse_feed(feed_uri)
    logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    # Summarize each post in the feed
    for i, item in content.items():
 
        if item['content'] is not None:
-            summary = summarization_funcs.summarize_content(
+            summary = summarization_funcs.summarize_content(
+                item['title'],
+                item['content']
+            )
            content[i]['summary'] = summary
 
            content[i].pop('content', None)
 
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+
    return json.dumps(content)
```
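The rest of the change is light instrumentation: `get_feed()` stamps `time.time()` on entry and logs elapsed seconds on both exit paths. The control-flow skeleton, with the actual feed work replaced by placeholders:

```python
import time
import json
import logging

logging.basicConfig(format='%(levelname)s - %(name)s - %(message)s', level=logging.INFO)


def get_feed_skeleton(website: str) -> str:
    '''Control-flow skeleton of get_feed() showing the new timing logs.'''

    start_time = time.time()

    logger = logging.getLogger(__name__ + '.get_feed()')
    logger.info('Getting feed content for: %s', website)

    # Placeholder for find_feed_uri()
    feed_uri = f'https://{website}/rss' if website else 'No feed found'

    if 'No feed found' in feed_uri:
        # The early-return path gets its own timing line
        logger.info('Completed in %s seconds', round(time.time() - start_time, 2))
        return 'No feed found'

    # Placeholder for parse_feed() plus summarization
    content = {0: {'title': 'Example post', 'link': feed_uri, 'summary': None}}

    logger.info('Completed in %s seconds', round(time.time() - start_time, 2))
    return json.dumps(content)
```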
rss_server.py
CHANGED

```diff
@@ -1,19 +1,24 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 
 import gradio as gr
 import assets.html as html
+import functions.tools as tool_funcs
+import functions.gradio_functions as gradio_funcs
 
+
+# Set-up logging
 # Make sure log directory exists
 Path('logs').mkdir(parents=True, exist_ok=True)
 
+# Clear old logs if present
+gradio_funcs.delete_old_logs('logs', 'rss_server')
 
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',
@@ -25,15 +30,25 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
 with gr.Blocks() as demo:
 
+    # Page text
+    gr.HTML(html.TITLE)
+    gr.HTML(html.DESCRIPTION)
+
+    # Log output
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=100)
+    timer = gr.Timer(0.5, active=True)
+
+    timer.tick( # pylint: disable=no-member
+        lambda: gradio_funcs.update_log(), # pylint: disable=unnecessary-lambda
+        outputs=dialog_output
+    )
 
+    # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
     output = gr.Textbox(label='RSS entries', lines=10)
     submit_button = gr.Button('Submit')
```