gperdrizet committed (unverified)
Commit 3ba8e58 · 2 Parent(s): 933ca58 dc68696

Merge pull request #14 from gperdrizet/dev

assets/html.py CHANGED
@@ -11,13 +11,14 @@ TITLE = (
 DESCRIPTION = (
     '''
     <p>RSS feed reader MCP server. See
-    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">Agentic RSS reader</a>
-    for a demonstration. Check out the
-    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">main project repo on GitHub</a>.
-    Both Spaces by <a href="https://www.linkedin.com/in/gperdrizet">George Perdrizet</a>.</p>
+    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
+    Agentic RSS reader</a> for a demonstration. Check out the
+    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
+    main project repo on GitHub</a>. Both Spaces by
+    <a href="https://www.linkedin.com/in/gperdrizet">George Perdrizet</a>.</p>
 
     <p>This Space is not meant to be used directly, but you can try out the bare tool below.
-    Enter a website name, website URL, or favorite feed URI. The tool will do it's best
+    Enter a website name, website URL, or feed URI. The tool will do its best
     to find the feed and return titles, links and summaries for the three most recent posts.
     Suggestions: http://openai.com/news/rss.xml, hackernews.com, slashdot, etc.</p>
 
functions/feed_extraction.py CHANGED
@@ -1,6 +1,8 @@
 '''Helper functions for MCP tools.'''
 
+import os
 import re
+import json
 import logging
 import urllib.request
 from urllib.error import HTTPError, URLError
@@ -10,11 +12,15 @@ from boilerpy3 import extractors
 from boilerpy3.exceptions import HTMLExtractionError
 from findfeed import search as feed_search
 from googlesearch import search as google_search
+from upstash_redis import Redis
 
 FEED_URIS = {}
 RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
 def find_feed_uri(website: str) -> str:
     '''Attempts to find URI for RSS feed. First checks if string provided in
@@ -42,14 +48,26 @@ def find_feed_uri(website: str) -> str:
         feed_uri = website
         logger.info('%s looks like a feed URI already - using it directly', website)
 
-    # Next, check the cache to see if we already have this feed's URI
+    # Next, check the cache to see if we already have this feed's URI locally
     elif website in FEED_URIS:
         feed_uri = FEED_URIS[website]
-        logger.info('%s feed URI in cache: %s', website, feed_uri)
+        logger.info('%s feed URI in local cache: %s', website, feed_uri)
+
+    # Then, check to see if the URI is in the Redis cache
+    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
+    cache_hit = False
+
+    if feed_uri is None:
+        cached_uri = REDIS.get(cache_key)
+
+        if cached_uri:
+            cache_hit = True
+            feed_uri = cached_uri
+            logger.info('%s feed URI in Redis cache: %s', website, feed_uri)
 
-    # If neither of those get it - try feedparse if it looks like a url
+    # If none of those get it - try feedparse if it looks like a url
     # or else just google it
-    else:
+    if feed_uri is None:
         if website.split('.')[-1] in COMMON_EXTENSIONS:
             website_url = website
             logger.info('%s looks like a website URL', website)
@@ -63,6 +81,10 @@ def find_feed_uri(website: str) -> str:
 
     FEED_URIS[website] = feed_uri
 
+    # Add the feed URI to the Redis cache if it wasn't already there
+    if cache_hit is False:
+        REDIS.set(cache_key, feed_uri)
+
     return feed_uri
 
 
@@ -89,28 +111,38 @@ def parse_feed(feed_uri: str) -> list:
 
         if 'title' in entry and 'link' in entry:
 
-            entry_content['title'] = entry.title
-            entry_content['link'] = entry.link
-
-            # entry_content['updated'] = None
-            # entry_content['summary'] = None
-            entry_content['content'] = None
-
-            # if 'updated' in entry:
-            #     entry_content['updated'] = entry.updated
-
-            # if 'summary' in entry:
-            #     summary = _get_text(entry.summary)
-            #     entry_content['summary'] = summary
-
-            if 'content' in entry:
-                entry_content['content'] = entry.content
-
-            if entry_content['content'] is None:
-
-                html = _get_html(entry_content['link'])
-                content = _get_text(html)
-                entry_content['content'] = content
+            title = entry.title
+
+            # Check the Redis cache for this entry
+            cache_key = title.lower().replace(' ', '_')
+            cache_hit = False
+            cached_entry = REDIS.get(cache_key)
+
+            if cached_entry:
+                cache_hit = True
+                entry_content = json.loads(cached_entry)
+                logger.info('Entry in Redis cache: "%s"', title)
+
+            # If it's not in the Redis cache, parse it from the feed data
+            else:
+                entry_content['title'] = entry.title
+                entry_content['link'] = entry.link
+                entry_content['content'] = None
+
+                if 'content' in entry:
+                    entry_content['content'] = entry.content
+
+                if entry_content['content'] is None:
+
+                    html = _get_html(entry_content['link'])
+                    content = _get_text(html)
+                    entry_content['content'] = content
+
+                logger.info('Parsed entry: "%s"', title)
+
+            # Add it to the Redis cache if it wasn't there, serialized as
+            # JSON so it reads back with json.loads() above
+            if cache_hit is False:
+                REDIS.set(cache_key, json.dumps(entry_content))
 
             entries[i] = entry_content
 
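The new find_feed_uri() is effectively a three-tier lookup: the in-process FEED_URIS dict, then the shared Upstash Redis instance, then the expensive feedparse/Google search. A minimal sketch of that read-through pattern, assuming the cached values are plain strings; slow_feed_search() is a hypothetical stand-in for the fallback search path:

import os

from upstash_redis import Redis

LOCAL_CACHE = {}
REDIS = Redis(
    url='https://sensible-midge-19304.upstash.io',
    token=os.environ['UPSTASH_REDIS_KEY']
)

def cached_feed_uri(website: str) -> str:
    '''Resolves a feed URI, trying the cheap caches before the slow search.'''

    # Tier 1: in-process dict - free, but lost when the Space restarts
    if website in LOCAL_CACHE:
        return LOCAL_CACHE[website]

    # Tier 2: Upstash Redis - survives restarts, shared across workers
    cache_key = f"{website.lower().replace(' ', '_')}-feed-uri"
    feed_uri = REDIS.get(cache_key)

    # Tier 3: the expensive lookup, with write-back to Redis
    if feed_uri is None:
        feed_uri = slow_feed_search(website)  # hypothetical slow path
        REDIS.set(cache_key, feed_uri)

    LOCAL_CACHE[website] = feed_uri
    return feed_uri

Note that the entry cache keys on the bare normalized title while the URI keys carry a -feed-uri suffix, which keeps the two kinds of values from colliding in the same Redis keyspace.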
 
functions/gradio_functions.py ADDED
@@ -0,0 +1,37 @@
+'''Collection of helper functions for Gradio UI and interface.'''
+
+import os
+import re
+
+
+def update_log(n: int = 10):
+    '''Gets updated logging output from disk to display to user.
+
+    Args:
+        n: number of most recent lines of log output to display
+
+    Returns:
+        Logging output as string
+    '''
+
+    with open('logs/rss_server.log', 'r', encoding='utf-8') as log_file:
+        lines = log_file.readlines()
+
+    return ''.join(lines[-n:])
+
+
+def delete_old_logs(directory: str, basename: str) -> None:
+    '''Deletes old log files from previous optimization sessions, if present.
+
+    Args:
+        directory: path to log file directory as string
+        basename: log file base name as string
+
+    Returns:
+        None
+    '''
+
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+        if re.search(basename, filename):
+            os.remove(file_path)
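Both helpers are small enough to exercise directly. A usage sketch against the paths used elsewhere in this commit:

import functions.gradio_functions as gradio_funcs

# Clear logs left over from a previous session...
gradio_funcs.delete_old_logs('logs', 'rss_server')

# ...then, once logging is writing to the file again, tail the last
# 20 lines for display in the UI
print(gradio_funcs.update_log(n=20))

Because delete_old_logs() matches filenames with re.search() rather than exact comparison, the basename 'rss_server' also catches rotated files like rss_server.log.1 left behind by the RotatingFileHandler configured in rss_server.py.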
functions/summarization.py CHANGED
@@ -4,9 +4,14 @@ import os
 import logging
 
 from openai import OpenAI
+from upstash_redis import Redis
 
+REDIS = Redis(
+    url='https://sensible-midge-19304.upstash.io',
+    token=os.environ['UPSTASH_REDIS_KEY']
+)
 
-def summarize_content(content: str) -> str:
+def summarize_content(title: str, content: str) -> str:
     '''Generates summary of article content using Modal inference endpoint.
 
     Args:
@@ -19,6 +24,15 @@ def summarize_content(content: str) -> str:
     logger = logging.getLogger(__name__ + '.summarize_content')
     logger.info('Summarizing extracted content')
 
+    # Check Redis cache for summary
+    cache_key = f"{title.lower().replace(' ', '_')}-summary"
+    cached_summary = REDIS.get(cache_key)
+
+    if cached_summary:
+        logger.info('Got summary from Redis cache: "%s"', title)
+        return cached_summary
+
+    # If the summary is not in the cache, generate it
     client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
 
     client.base_url = (
@@ -29,16 +43,6 @@ def summarize_content(content: str) -> str:
     model = client.models.list().data[0]
     model_id = model.id
 
-    # messages = [
-    #     {
-    #         'role': 'system',
-    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
-    #         'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
-    #         'role': 'user',
-    #         'content': content
-    #     }
-    # ]
-
     messages = [
         {
             'role': 'system',
@@ -68,7 +72,11 @@ def summarize_content(content: str) -> str:
         logger.error('Error during Modal API call: %s', e)
 
     if response is not None:
-        return response.choices[0].message.content
+        summary = response.choices[0].message.content
 
     else:
-        return None
+        summary = None
+
+    # Cache the summary for next time (skip caching failed calls)
+    if summary is not None:
+        REDIS.set(cache_key, summary)
+
+    logger.info('Summarized: "%s"', title)
+    return summary
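Both this module and feed_extraction.py now build Redis keys inline with title.lower().replace(' ', '_'). A single shared helper (hypothetical, not in this commit) would keep the two key schemes consistent and also collapse tabs and newlines, which the inline replace() misses:

import re

def make_cache_key(text: str, suffix: str = '') -> str:
    '''Normalizes a title or website string into a stable Redis key.'''

    # Collapse every run of whitespace, not just single spaces
    key = re.sub(r'\s+', '_', text.strip().lower())

    return f'{key}-{suffix}' if suffix else key

# make_cache_key('My  Post\tTitle', 'summary') -> 'my_post_title-summary'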
functions/tools.py CHANGED
@@ -1,5 +1,6 @@
 '''Tool functions for MCP server'''
 
+import time
 import json
 import logging
 import functions.feed_extraction as extraction_funcs
@@ -10,7 +11,7 @@ def get_feed(website: str) -> list:
     '''Gets RSS feed content from a given website. Can take a website or RSS
     feed URL directly, or the name of a website. Will attempt to find RSS
     feed and return title, summary and link to full article for most recent
-    items in feed
+    items in feed.
 
     Args:
         website: URL or name of website to extract RSS feed content from
@@ -20,24 +21,35 @@ def get_feed(website: str) -> list:
         feed for the requested website could not be found
     '''
 
-    logger = logging.getLogger(__name__ + '.get_content')
+    start_time = time.time()
+
+    logger = logging.getLogger(__name__ + '.get_feed()')
     logger.info('Getting feed content for: %s', website)
 
+    # Find the feed's URI from the website name/URL
     feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
+        logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
         return 'No feed found'
 
+    # Parse and extract content from the feed
     content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    # Summarize each post in the feed
    for i, item in content.items():
 
        if item['content'] is not None:
-            summary = summarization_funcs.summarize_content(item['content'])
+            summary = summarization_funcs.summarize_content(
+                item['title'],
+                item['content']
+            )
            content[i]['summary'] = summary
 
        content[i].pop('content', None)
 
+    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))
+
     return json.dumps(content)
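get_feed() returns either the literal string 'No feed found' or a JSON-encoded dict of entries. A sketch of consuming it; note that the integer indices parse_feed() uses become string keys after the JSON round trip:

import json

from functions.tools import get_feed

raw = get_feed('hackernews.com')

if raw != 'No feed found':
    entries = json.loads(raw)

    for entry in entries.values():
        # 'summary' is only present when the entry had extractable content
        print(entry['title'], entry.get('summary'))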
rss_server.py CHANGED
@@ -1,19 +1,24 @@
 '''Main script to run gradio interface and MCP server.'''
 
 import logging
+from functools import partial
 from pathlib import Path
 from logging.handlers import RotatingFileHandler
 
 import gradio as gr
 import assets.html as html
-from functions import tools as tool_funcs
+import functions.tools as tool_funcs
+import functions.gradio_functions as gradio_funcs
 
+
+# Set-up logging
 # Make sure log directory exists
 Path('logs').mkdir(parents=True, exist_ok=True)
 
-# Set-up logger
-logger = logging.getLogger()
+# Clear old logs if present
+gradio_funcs.delete_old_logs('logs', 'rss_server')
 
+# Set up the root logger so we catch logs from everything
 logging.basicConfig(
     handlers=[RotatingFileHandler(
         'logs/rss_server.log',
@@ -25,15 +30,25 @@ logging.basicConfig(
     format='%(levelname)s - %(name)s - %(message)s'
 )
 
+# Get a logger
 logger = logging.getLogger(__name__)
 
-
 with gr.Blocks() as demo:
 
-    with gr.Row():
-        gr.HTML(html.TITLE)
-
-    gr.Markdown(html.DESCRIPTION)
+    # Page text
+    gr.HTML(html.TITLE)
+    gr.HTML(html.DESCRIPTION)
+
+    # Log output
+    dialog_output = gr.Textbox(label='Server logs', lines=10, max_lines=100)
+    timer = gr.Timer(0.5, active=True)
+
+    timer.tick( # pylint: disable=no-member
+        lambda: gradio_funcs.update_log(), # pylint: disable=unnecessary-lambda
+        outputs=dialog_output
+    )
+
+    # Get feed tool
     website_url = gr.Textbox('hackernews.com', label='Website')
     output = gr.Textbox(label='RSS entries', lines=10)
     submit_button = gr.Button('Submit')
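The diff is truncated before the submit button is wired up and the app is launched. A plausible continuation, assuming Gradio's standard click() event API and its mcp_server launch flag (the functools.partial import added above suggests some arguments are bound in the part not shown):

# Hypothetical continuation - not part of this diff
submit_button.click(
    tool_funcs.get_feed,  # exposed to the UI here and as an MCP tool at launch
    inputs=website_url,
    outputs=output
)

demo.launch(mcp_server=True)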