Merge pull request #18 from gperdrizet/dev
Files changed:

- assets/html.py +0 -33
- assets/text.py +32 -0
- functions/feed_extraction.py +22 -19
- functions/gradio_functions.py +49 -0
- functions/rag.py +52 -0
- functions/summarization.py +3 -1
- functions/tools.py +169 -9
- requirements.txt +4 -1
- rss_server.py +129 -12
assets/html.py
DELETED
@@ -1,33 +0,0 @@
-'''HTML elements for Gradio interface.'''
-
-TITLE = (
-    '''
-    <center>
-    <h1>RSS feed reader</h1>
-    </center>
-    '''
-)
-
-DESCRIPTION = (
-    '''
-    <p>RSS feed reader MCP server. See
-    <a href="https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client">
-    Agentic RSS reader</a>for a demonstration. Check out the
-    <a href="https://github.com/gperdrizet/MCP-hackathon/tree/main">
-    main project repo on GitHub</a>. Both Spaces by
-    <a href="https://www.linkedin.com/in/gperdrizet">George Perdrizet</a>.</p>
-
-    <p>This Space is not meant to be used directly, but you can try out the bare tool below.
-    Enter a website name, website URL, or feed URI. The tool will do it's best
-    to find the feed and return titles, links and summaries for the three most recent posts.
-    Suggestions: http://openai.com/news/rss.xml, hackernews.com, slashdot, etc.</p>
-
-    <h2>Tools</h2>
-
-    <ol>
-    <li><b>DONE</b> Given a website name or URL, find its RSS feed and return recent
-    article titles, links and a generated summary of content if avalible</li>
-    <li><b>TODO</b> Simple RAG on requested RSS feed content</li>
-    </ol>
-    '''
-)
assets/text.py
ADDED
@@ -0,0 +1,32 @@
+'''HTML elements for Gradio interface.'''
+
+TITLE = ('''
+<center>
+<h1>RSS feed reader</h1>
+</center>
+''')
+
+DESCRIPTION = ('''
+RSS feed reader MCP server. See
+[Agentic RSS reader](https://huggingface.co/spaces/Agents-MCP-Hackathon/rss-mcp-client)
+for a demonstration. Check out the
+[main project repo on GitHub](https://github.com/gperdrizet/MCP-hackathon/tree/main)
+. Both Spaces by
+[George Perdrizet](https://www.linkedin.com/in/gperdrizet)
+
+This space is not meant to be used directly. It exposes a set of tools to
+interact with RSS feeds for use by agents. For testing and demonstration,
+you can try the tools directly below.
+
+## Tools
+
+1. `get_feed()`: Given a website name or URL, find its RSS feed and
+return recent article titles, links and a generated summary of content if
+avalible. Caches results for fast retrieval by other tools. Embeds content
+to vector database for subsequent RAG.
+2. `context_search()`: Vector search on article content for RAG context.
+3. `find_article()`: Uses vector search on article content to find title of article
+that user is referring to.
+4. `get_summary()`: Gets article summary from Redis cache using article title.
+5. `get_link()`: Gets article link from Redis cache using article title.
+''')
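
The DESCRIPTION above mirrors the tool functions this PR adds to functions/tools.py. As a rough sketch of how an agent (or a quick local test) might chain tools 2 through 5 once a feed has been ingested with get_feed() - the query strings below are arbitrary examples, and the UPSTASH_* environment variables must already be set:

    import functions.tools as tools  # importing also starts the RAG ingest thread

    # Resolve a vague user reference to an exact cached article title
    title = tools.find_article('the article about air traffic control floppy disks')

    # Look up the cached summary and link for that title
    print(tools.get_summary(title))
    print(tools.get_link(title))

    # Pull extra context for a question, optionally scoped to that one article
    print(tools.context_search('What is replacing the floppy disks?', article_title=title))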
functions/feed_extraction.py
CHANGED
@@ -2,7 +2,6 @@

import os
import re
-import json
import logging
import urllib.request
from urllib.error import HTTPError, URLError
@@ -53,8 +52,8 @@ def find_feed_uri(website: str) -> str:
        feed_uri = FEED_URIS[website]
        logger.info('%s feed URI in local cache: %s', website, feed_uri)

-    #
-    cache_key = f
+    # If we still haven't found it, check to see if the URI is in the Redis cache
+    cache_key = f'{website} feed uri'
    cache_hit = False

    if feed_uri is None:
@@ -65,7 +64,7 @@ def find_feed_uri(website: str) -> str:
        feed_uri = cached_uri
        logger.info('%s feed URI in Redis cache: %s', website, feed_uri)

-    # If none of those get it - try feedparse if it looks like a url
+    # If still none of those methods get it - try feedparse if it looks like a url
    # or else just google it
    if feed_uri is None:
        if website.split('.')[-1] in COMMON_EXTENSIONS:
@@ -79,6 +78,7 @@ def find_feed_uri(website: str) -> str:
        feed_uri = _get_feed(website_url)
        logger.info('get_feed() returned %s', feed_uri)

+    # Add to local cache
    FEED_URIS[website] = feed_uri

    # Add the feed URI to the redis cache if it wasn't already there
@@ -88,14 +88,16 @@ def find_feed_uri(website: str) -> str:
    return feed_uri


-def parse_feed(feed_uri: str) -> list:
+def parse_feed(feed_uri: str, n: int) -> list:
    '''Gets content from a remote RSS feed URI.

    Args:
        feed_uri: The RSS feed to get content from
+        n: the number of feed entries to parse

    Returns:
-        List of
+        List of dictionaries for the n most recent entries in the RSS feed.
+        Each dictionary contains 'title', 'link' and 'content' keys.
    '''

    logger = logging.getLogger(__name__ + '.parse_feed')
@@ -112,16 +114,15 @@ def parse_feed(feed_uri: str) -> list:
        if 'title' in entry and 'link' in entry:

            title = entry.title
+            entry_content['title'] = title

            # Check the Redis cache
-
-            cache_hit = False
-            cached_entry = REDIS.get(cache_key)
+            cached_link = REDIS.get(f'{title} link')

-            if
-                cache_hit = True
-                entry_content = json.loads(cached_entry)
+            if cached_link:
                logger.info('Entry in Redis cache: "%s"', title)
+                entry_content['link'] = cached_link
+                entry_content['content'] = REDIS.get(f'{title} content')

            # If its not in the Redis cache, parse it from the feed data
            else:
@@ -129,24 +130,26 @@ def parse_feed(feed_uri: str) -> list:
                entry_content['link'] = entry.link
                entry_content['content'] = None

+                # Grab the article content from the feed, if provided
                if 'content' in entry:
                    entry_content['content'] = entry.content

-
+                # If not, try to get the article content from the link
+                elif entry_content['content'] is None:

                    html = _get_html(entry_content['link'])
                    content = _get_text(html)
                    entry_content['content'] = content

-
-
-            if cache_hit is False:
-                REDIS.set(cache_key, entry_content)
+                # Add everything to the cache
+                REDIS.set(f'{title} link', entry_content['link'])
+                REDIS.set(f'{title} content', entry_content['content'])
+
+                logger.info('Parsed entry: "%s"', title)

            entries[i] = entry_content

-            if i ==
+            if i == n-1:
                break

    logger.info('Entries contains %s elements', len(list(entries.keys())))
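
The net effect of these changes is that parse_feed() now takes an explicit entry count and caches each entry under two plain string keys instead of one JSON blob. A minimal usage sketch, assuming the feed resolves and has at least one entry (entries are keyed by integer position starting at 0):

    import functions.feed_extraction as extraction_funcs

    feed_uri = extraction_funcs.find_feed_uri('slashdot')

    # Parse the three most recent entries; each is cached in Redis as
    #   '<title> link'    -> article URL
    #   '<title> content' -> extracted article text
    entries = extraction_funcs.parse_feed(feed_uri, 3)

    print(entries[0]['title'], entries[0]['link'])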
functions/gradio_functions.py
CHANGED
@@ -2,6 +2,55 @@

import os
import re
+import logging
+
+from openai import OpenAI
+
+def call_modal() -> None:
+    '''Sends request to Modal to spin up container'''
+
+    logger = logging.getLogger(__name__ + '.call_modal()')
+
+    # Call the modal container so it spins up
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+
+    # Default to first avalible model
+    model = client.models.list().data[0]
+    model_id = model.id
+
+    messages = [
+        {
+            'role': 'system',
+            'content': ('Interpret the following proverb in 50 words or less: ' +
+                'A poor craftsman blames the eye of the beholder')
+        }
+    ]
+
+    logger.info('Prompt: %s', messages[0]['content'])
+
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+    }
+
+    try:
+        response = client.chat.completions.create(**completion_args)
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+
+    if response is not None:
+        reply = response.choices[0].message.content
+
+    else:
+        reply = None
+
+    logger.info('Reply: %s', reply)


def update_log(n: int = 10):
functions/rag.py
ADDED
@@ -0,0 +1,52 @@
+'''Collection of function for RAG on article texts.'''
+
+import os
+import logging
+import queue
+from semantic_text_splitter import TextSplitter
+from tokenizers import Tokenizer
+from upstash_vector import Index
+
+
+def ingest(rag_ingest_queue: queue.Queue) -> None:
+    '''Semantically chunks article and upsert to Upstash vector db
+    using article title as namespace.'''
+
+    logger = logging.getLogger(__name__ + '.ingest()')
+
+    index = Index(
+        url='https://living-whale-89944-us1-vector.upstash.io',
+        token=os.environ['UPSTASH_VECTOR_KEY']
+    )
+
+    while True:
+
+        namespaces = index.list_namespaces()
+
+        item = rag_ingest_queue.get()
+        logger.info('Upserting "%s": %s', item['title'], item)
+        title = item['title']
+
+        if title not in namespaces:
+            text = item['content']
+            logger.info('Got "%s" from RAG ingest queue', title)
+
+            tokenizer=Tokenizer.from_pretrained('bert-base-uncased')
+            splitter=TextSplitter.from_huggingface_tokenizer(tokenizer, 256)
+            chunks=splitter.chunks(text)
+
+            for i, chunk in enumerate(chunks):
+
+                index.upsert(
+                    [
+                        (
+                            hash(f'{title}-{i}'),
+                            chunk,
+                            {'namespace': title}
+                        )
+                    ],
+                )
+            logger.info('Ingested %s chunks into vector DB', i + 1)
+
+        else:
+            logger.info('%s already in RAG namespace', title)
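
ingest() is meant to run forever on a background thread, blocking on the queue; functions/tools.py (below) starts that thread at import time. A minimal producer-side sketch with a made-up article dict (real items come from feed_extraction.parse_feed() and carry 'title', 'link' and 'content' keys):

    import queue
    import threading

    import functions.rag as rag_funcs

    rag_ingest_queue = queue.Queue()

    # daemon=True is only so this sketch can exit; tools.py uses a plain Thread
    threading.Thread(target=rag_funcs.ingest, args=(rag_ingest_queue,), daemon=True).start()

    rag_ingest_queue.put({
        'title': 'Example article',                       # used as the vector namespace
        'link': 'https://example.com/post',               # hypothetical URL
        'content': 'Full article text to chunk and embed...'
    })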
functions/summarization.py
CHANGED
@@ -25,7 +25,7 @@ def summarize_content(title: str, content: str) -> str:
    logger.info('Summarizing extracted content')

    # Check Redis cache for summary
-    cache_key = f
+    cache_key = f'{title} summary'
    cached_summary = REDIS.get(cache_key)

    if cached_summary:
@@ -77,6 +77,8 @@
    else:
        summary = None

+    # Add the new summary to the cache
    REDIS.set(cache_key, summary)
    logger.info('Summarized: "%s"', title)
+
    return summary
functions/tools.py
CHANGED
@@ -1,20 +1,40 @@
'''Tool functions for MCP server'''

+import os
+import threading
import time
import json
import logging
+import queue
+from typing import Tuple
+from upstash_vector import Index
+from upstash_redis import Redis
+
import functions.feed_extraction as extraction_funcs
import functions.summarization as summarization_funcs
+import functions.rag as rag_funcs
+
+RAG_INGEST_QUEUE = queue.Queue()
+
+rag_ingest_thread = threading.Thread(
+    target=rag_funcs.ingest,
+    args=(RAG_INGEST_QUEUE,)
+)
+
+rag_ingest_thread.start()


-def get_feed(website: str) -> list:
+def get_feed(website: str, n: int = 3) -> list:
    '''Gets RSS feed content from a given website. Can take a website or RSS
    feed URL directly, or the name of a website. Will attempt to find RSS
    feed and return title, summary and link to full article for most recent
-    items in feed.
+    n items in feed. This function is slow and resource heavy, only call it when
+    the user wants to check a feed for new content, or asks for content from a
+    feed that you have not retrieved yet.

    Args:
        website: URL or name of website to extract RSS feed content from
+        n: (optional) number of articles to parse from feed, defaults to 3

    Returns:
        JSON string containing the feed content or 'No feed found' if a RSS
@@ -35,21 +55,161 @@ def get_feed(website: str) -> list:
        return 'No feed found'

    # Parse and extract content from the feed
-
-    logger.info('parse_feed() returned %s entries', len(list(
+    articles = extraction_funcs.parse_feed(feed_uri, n)
+    logger.info('parse_feed() returned %s entries', len(list(articles.keys())))

-    #
-    for i, item in
+    # Loop on the posts, sending them to RAG (nonblocking) and summarization (blocking)
+    for i, item in articles.items():

+        # Check if content is present
        if item['content'] is not None:
+            logger.info('Summarizing/RAG ingesting: %s', item)
+
+            # Send to RAG ingest
+            RAG_INGEST_QUEUE.put(item.copy())
+            logger.info('"%s" sent to RAG ingest', item['title'])
+
+            # Generate summary and add to content
            summary = summarization_funcs.summarize_content(
                item['title'],
                item['content']
            )
-            content[i]['summary'] = summary

-
+            articles[i]['summary'] = summary
+            logger.info('Summary of "%s" generated', item['title'])
+
+        # Remove full-text content before returning
+        articles[i].pop('content', None)

    logger.info('Completed in %s seconds', round(time.time()-start_time, 2))

-
+    # Return content dictionary as string
+    return json.dumps(articles)
+
+
+def context_search(query: str, article_title: str = None) -> list[Tuple[float, str]]:
+    '''Searches for context relevant to query. Use this Function to search
+    for additional general information if needed before answering the user's question
+    about an article. If article_title is provided the search will only return
+    results from that article. If article_title is omitted, the search will
+    include all articles currently in the cache.
+
+    Ags:
+        query: user query to find context for
+        article_title: optional, use this argument to search only for
+            context from a specific article, defaults to None
+
+    Returns:
+        Text relevant to the query
+    '''
+
+    logger = logging.getLogger(__name__ + 'context_search')
+
+    index = Index(
+        url='https://living-whale-89944-us1-vector.upstash.io',
+        token=os.environ['UPSTASH_VECTOR_KEY']
+    )
+
+    results = None
+
+    results = index.query(
+        data=query,
+        top_k=3,
+        include_data=True,
+        namespace=article_title
+    )
+
+    logger.info('Retrieved %s chunks for "%s"', len(results), query)
+
+    return results[0].data
+
+
+def find_article(query: str) -> list[Tuple[float, str]]:
+    '''Uses vector search to find the most likely title of the article
+    referred to by query. Use this function if the user is asking about
+    an article, but it is not clear what the exact title of the article is.
+
+    Args:
+        query: query to to find source article tile for
+
+    Returns:
+        Article title
+    '''
+
+    logger = logging.getLogger(__name__ + 'context_search')
+
+    index = Index(
+        url='https://living-whale-89944-us1-vector.upstash.io',
+        token=os.environ['UPSTASH_VECTOR_KEY']
+    )
+
+    results = None
+
+    results = index.query(
+        data=query,
+        top_k=3,
+        include_metadata=True,
+        include_data=True
+    )
+
+    logger.info('Retrieved %s chunks for "%s"', len(results), query)
+
+    return results[0].metadata['namespace']
+
+
+def get_summary(title: str) -> str:
+    '''Uses article title to retrieve summary of article content.
+
+    Args:
+        title: exact title of article
+
+    Returns:
+        Short summary of article content.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_summary()')
+
+    redis = Redis(
+        url='https://sensible-midge-19304.upstash.io',
+        token=os.environ['UPSTASH_REDIS_KEY']
+    )
+
+    cache_key = f'{title} summary'
+    summary = redis.get(cache_key)
+
+    if summary:
+
+        logger.info('Got summary for "%s": %s', title, summary[:100])
+        return summary
+
+    logger.info('Could not find summary for: "%s"', title)
+    return f'No article called "{title}". Make sure you have the correct title.'
+
+
+def get_link(title: str) -> str:
+    '''Uses article title to look up direct link to article content webpage.
+
+    Args:
+        title: exact title of article
+
+    Returns:
+        Article webpage URL.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_link()')
+
+    redis = Redis(
+        url='https://sensible-midge-19304.upstash.io',
+        token=os.environ['UPSTASH_REDIS_KEY']
+    )
+
+    cache_key = f'{title} link'
+    link = redis.get(cache_key)
+
+    if link:
+
+        logger.info('Got link for "%s": %s', title, link)
+        return link
+
+    logger.info('Could not find link for: "%s"', title)
+    return f'No article called "{title}". Make sure you have the correct title.'
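
After this change get_feed() returns a JSON string keyed by entry position, with the full article text stripped out before returning. A rough sketch of consuming it, assuming a feed is found (otherwise the tool returns the string 'No feed found'); the feed name is an arbitrary example and 'summary' is only present for entries whose content could be extracted:

    import json

    import functions.tools as tool_funcs  # importing starts the RAG ingest thread

    raw = tool_funcs.get_feed('hackernews.com', n=2)
    articles = json.loads(raw)

    for position, article in articles.items():
        print(position, article['title'])
        print(article['link'])
        print(article.get('summary'))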
requirements.txt
CHANGED
@@ -5,4 +5,7 @@ googlesearch-python
gradio
mcp
openai
-
+semantic-text-splitter
+tokenizers
+upstash-redis
+upstash-vector
rss_server.py
CHANGED
@@ -1,18 +1,19 @@
'''Main script to run gradio interface and MCP server.'''

import logging
-from functools import partial
from pathlib import Path
from logging.handlers import RotatingFileHandler

import gradio as gr
-import assets.
+import assets.text as text
import functions.tools as tool_funcs
import functions.gradio_functions as gradio_funcs

+# Call the modal container so it spins up before the rest of
+# the app starts
+gradio_funcs.call_modal()

-# Set-up logging
-# Make sure log directory exists
+# Set-up logging - make sure log directory exists
Path('logs').mkdir(parents=True, exist_ok=True)

# Clear old logs if present
@@ -36,11 +37,14 @@ logger = logging.getLogger(__name__)
with gr.Blocks() as demo:

    # Page text
-    gr.HTML(
-    gr.
+    gr.HTML(text.TITLE)
+    gr.Markdown(text.DESCRIPTION)
+

    # Log output
-
+    with gr.Row():
+        dialog_output = gr.Textbox(label='Server logs', lines=7, max_lines=5)
+
    timer = gr.Timer(0.5, active=True)

    timer.tick( # pylint: disable=no-member
@@ -49,19 +53,132 @@
        show_api=False
    )

+
    # Get feed tool
-
-
-
+    gr.Markdown('### 1. `get_feed()`')
+    website_url = gr.Textbox('slashdot', label='Website')
+    feed_output = gr.Textbox(label='RSS entries', lines=7, max_lines=7)

-
+    with gr.Row():
+        website_submit_button = gr.Button('Submit website')
+        website_clear_button = gr.ClearButton(components=[website_url, feed_output])
+
+    website_submit_button.click( # pylint: disable=no-member
        fn=tool_funcs.get_feed,
        inputs=website_url,
-        outputs=
+        outputs=feed_output,
        api_name='Get RSS feed content'
    )


+    # Vector search tool
+    gr.Markdown('### 2. `context_search()`')
+
+    context_search_query = gr.Textbox(
+        'How is the air traffic control system being updated?',
+        label='Context search query'
+    )
+    context_search_output = gr.Textbox(
+        label='Context search results',
+        lines=7,
+        max_lines=7
+    )
+
+    with gr.Row():
+        context_search_submit_button = gr.Button('Submit query')
+        context_search_clear_button = gr.ClearButton(
+            components=[context_search_query, context_search_output]
+        )
+
+    context_search_submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.context_search,
+        inputs=context_search_query,
+        outputs=context_search_output,
+        api_name='Context vector search'
+    )
+
+
+    # Find article tool
+    gr.Markdown('### 3. `find_article()`')
+
+    article_search_query = gr.Textbox(
+        'How is the air traffic control system being updated?',
+        label='Article search query'
+    )
+    article_search_output = gr.Textbox(
+        label='Article search results',
+        lines=3,
+        max_lines=3
+    )
+
+    with gr.Row():
+        article_search_submit_button = gr.Button('Submit query')
+        article_search_clear_button = gr.ClearButton(
+            components=[article_search_query, article_search_output]
+        )
+
+    article_search_submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.find_article,
+        inputs=article_search_query,
+        outputs=article_search_output,
+        api_name='Article vector search'
+    )
+
+
+    # Get summary tool
+    gr.Markdown('### 4. `get_summary()`')
+
+    article_title = gr.Textbox(
+        'FAA To Eliminate Floppy Disks Used In Air Traffic Control Systems',
+        label='Article title'
+    )
+    article_summary = gr.Textbox(
+        label='Article summary',
+        lines=3,
+        max_lines=3
+    )
+
+    with gr.Row():
+        article_title_submit_button = gr.Button('Submit title')
+        article_title_clear_button = gr.ClearButton(
+            components=[article_title, article_summary]
+        )
+
+    article_title_submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.get_summary,
+        inputs=article_title,
+        outputs=article_summary,
+        api_name='Article summary search'
+    )
+
+
+    # Get link tool
+    gr.Markdown('### 5. `get_link()`')
+
+    article_title_link = gr.Textbox(
+        'FAA To Eliminate Floppy Disks Used In Air Traffic Control Systems',
+        label='Article title'
+    )
+    article_link = gr.Textbox(
+        label='Article link',
+        lines=3,
+        max_lines=3
+    )
+
+    with gr.Row():
+        article_link_submit_button = gr.Button('Submit title')
+        article_link_clear_button = gr.ClearButton(
+            components=[article_title_link, article_link]
+        )
+
+    article_link_submit_button.click( # pylint: disable=no-member
+        fn=tool_funcs.get_link,
+        inputs=article_title_link,
+        outputs=article_link,
+        api_name='Article link search'
+    )
+
+
if __name__ == '__main__':

    demo.launch(mcp_server=True)
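
Taken together, the new code paths read at least three secrets from the environment: MODAL_API_KEY (functions/gradio_functions.py), UPSTASH_VECTOR_KEY (functions/rag.py and functions/tools.py) and UPSTASH_REDIS_KEY (functions/tools.py). A small pre-flight check along these lines, run before starting rss_server.py, is one way to fail fast when the Space is missing a secret; the check itself is not part of this PR:

    import os

    # Environment variable names taken from the os.environ[...] lookups in this PR
    REQUIRED = ['MODAL_API_KEY', 'UPSTASH_VECTOR_KEY', 'UPSTASH_REDIS_KEY']

    missing = [name for name in REQUIRED if name not in os.environ]

    if missing:
        raise SystemExit(f'Missing environment variables: {", ".join(missing)}')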