Merge pull request #10 from gperdrizet/dev
- assets/html.py +10 -9
- functions/{helper_functions.py → feed_extraction.py} +67 -63
- functions/summarization.py +74 -0
- functions/tools.py +20 -8
- requirements.txt +2 -1
- rss_server.py +1 -1
assets/html.py
CHANGED
@@ -3,24 +3,25 @@
 TITLE = (
     '''
     <center>
-        <h1>RSS feed
+        <h1>RSS feed reader</h1>
     </center>
     '''
 )
 
 DESCRIPTION = (
     '''
-    Functions to find and
-
+    Functions to find, extract and summarize RSS feeds are complete.
+
+    <h2>Tools</h2>
 
     <ol>
-        <li>
-
+        <li><b>DONE</b> Given a website name or URL, find its RSS feed and return recent
+        article titles, links and a generated summary of content if available</li>
+        <li><b>TODO</b> Simple RAG on requested RSS feed content</li>
     </ol>
 
-    For now we
-
-
-    hackernews.com, Hugging Face, etc
+    For now we dump the extracted RSS title, link and summary below. Try asking for a
+    feed by website name, website URL, or entering your favorite feed URI directly.
+    Suggestions: http://openai.com/news/rss.xml, hackernews.com, Hugging Face, etc
     '''
 )
functions/{helper_functions.py → feed_extraction.py}
RENAMED
@@ -55,10 +55,10 @@ def find_feed_uri(website: str) -> str:
         logger.info('%s looks like a website URL', website)
 
     else:
-        website_url = get_url(website)
+        website_url = _get_url(website)
         logger.info('Google result for %s: %s', website, website_url)
 
-    feed_uri = get_feed(website_url)
+    feed_uri = _get_feed(website_url)
     logger.info('get_feed() returned %s', feed_uri)
 
     FEED_URIS[website] = feed_uri
@@ -66,52 +66,6 @@ def find_feed_uri(website: str) -> str:
     return feed_uri
 
 
-def get_url(company_name: str) -> str:
-    '''Finds the website associated with the name of a company or
-    publication.
-
-    Args:
-        company_name: the name of the company, publication or site to find
-        the URL for
-
-    Returns:
-        The URL for the company, publication or website.
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_url')
-    logger.info('Getting website URL for %s', company_name)
-
-    query = f'{company_name} official website'
-
-    for url in google_search(query, num_results=5):
-        if 'facebook' not in url and 'linkedin' not in url:
-            return url
-
-    return None
-
-
-def get_feed(website_url: str) -> str:
-    '''Finds the RSS feed URI for a website given the website's url.
-
-    Args:
-        website_url: The url for the website to find the RSS feed for
-
-    Returns:
-        The website's RSS feed URI as a string
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_content')
-    logger.info('Getting feed URI for: %s', website_url)
-
-    feeds = feed_search(website_url)
-
-    if len(feeds) > 0:
-        return str(feeds[0].url)
-
-    else:
-        return f'No feed found for {website_url}'
-
-
 def parse_feed(feed_uri: str) -> list:
     '''Gets content from a remote RSS feed URI.
 
@@ -138,28 +92,29 @@ def parse_feed(feed_uri: str) -> list:
         entry_content['title'] = entry.title
         entry_content['link'] = entry.link
 
-        entry_content['updated'] = None
-        entry_content['summary'] = None
+        # entry_content['updated'] = None
+        # entry_content['summary'] = None
         entry_content['content'] = None
 
-        if 'updated' in entry:
-            entry_content['updated'] = entry.updated
+        # if 'updated' in entry:
+        #     entry_content['updated'] = entry.updated
 
-        if 'summary' in entry:
-            summary = get_text(entry.summary)
-            entry_content['summary'] = summary
+        # if 'summary' in entry:
+        #     summary = _get_text(entry.summary)
+        #     entry_content['summary'] = summary
 
         if 'content' in entry:
             entry_content['content'] = entry.content
 
-        html = get_html(entry_content['link'])
-        content = get_text(html)
+        if entry_content['content'] is None:
 
-        entry_content['content'] = content
+            html = _get_html(entry_content['link'])
+            content = _get_text(html)
+            entry_content['content'] = content
 
         entries[i] = entry_content
 
-        if i ==
+        if i == 2:
             break
 
     logger.info('Entries contains %s elements', len(list(entries.keys())))
@@ -167,7 +122,53 @@ def parse_feed(feed_uri: str) -> list:
     return entries
 
 
-def get_html(url: str) -> str:
+def _get_url(company_name: str) -> str:
+    '''Finds the website associated with the name of a company or
+    publication.
+
+    Args:
+        company_name: the name of the company, publication or site to find
+        the URL for
+
+    Returns:
+        The URL for the company, publication or website.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_url')
+    logger.info('Getting website URL for %s', company_name)
+
+    query = f'{company_name} official website'
+
+    for url in google_search(query, num_results=5):
+        if 'facebook' not in url and 'linkedin' not in url:
+            return url
+
+    return None
+
+
+def _get_feed(website_url: str) -> str:
+    '''Finds the RSS feed URI for a website given the website's url.
+
+    Args:
+        website_url: The url for the website to find the RSS feed for
+
+    Returns:
+        The website's RSS feed URI as a string
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_content')
+    logger.info('Getting feed URI for: %s', website_url)
+
+    feeds = feed_search(website_url)
+
+    if len(feeds) > 0:
+        return str(feeds[0].url)
+
+    else:
+        return f'No feed found for {website_url}'
+
+
+def _get_html(url: str) -> str:
     '''Gets HTML string content from url
 
     Args:
@@ -221,7 +222,7 @@ def get_html(url: str) -> str:
     return content
 
 
-def get_text(html: str) -> str:
+def _get_text(html: str) -> str:
     '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
     function to try and extract text from HTML as cleanly as possible.
 
@@ -248,10 +249,10 @@ def get_text(html: str) -> str:
     except TypeError:
         pass
 
-    return clean_html(html)
+    return _clean_html(html)
 
 
-def clean_html(html: str) -> str:
+def _clean_html(html: str) -> str:
     '''
     Remove HTML markup from the given string.
 
@@ -262,6 +263,9 @@ def clean_html(html: str) -> str:
         Cleaned string
     '''
 
+    if html is None:
+        return None
+
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
 
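After the rename, find_feed_uri() and parse_feed() are the module's only public functions; everything prefixed with an underscore is an internal helper. A minimal sketch of how the two chain together (not part of the diff; assumes the repository root is on the import path and network access is available):

import json

import functions.feed_extraction as extraction_funcs

# Resolve a bare site name to a feed URI; full URLs and feed URIs pass
# through the same entry point
feed_uri = extraction_funcs.find_feed_uri('hackernews.com')

# Pull the most recent entries; the loop breaks at i == 2, so at most
# three entries come back
entries = extraction_funcs.parse_feed(feed_uri)
print(json.dumps(entries, indent=2, default=str))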
functions/summarization.py
ADDED
@@ -0,0 +1,74 @@
+'''Functions to summarize article content.'''
+
+import os
+import logging
+
+from openai import OpenAI
+
+
+def summarize_content(content: str) -> str:
+    '''Generates summary of article content using Modal inference endpoint.
+
+    Args:
+        content: string containing the text content to be summarized
+
+    Returns:
+        Summarized text as string
+    '''
+
+    logger = logging.getLogger(__name__ + '.summarize_content')
+    logger.info('Summarizing extracted content')
+
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+
+    # Default to first available model
+    model = client.models.list().data[0]
+    model_id = model.id
+
+    # messages = [
+    #     {
+    #         'role': 'system',
+    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
+    #         'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
+    #         'role': 'user',
+    #         'content': content
+    #     }
+    # ]
+
+    messages = [
+        {
+            'role': 'system',
+            'content': f'Summarize the following text in 50 words returning only the summary: {content}'
+        }
+    ]
+
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+        # "frequency_penalty": args.frequency_penalty,
+        # "max_tokens": 128,
+        # "n": args.n,
+        # "presence_penalty": args.presence_penalty,
+        # "seed": args.seed,
+        # "stop": args.stop,
+        # "stream": args.stream,
+        # "temperature": args.temperature,
+        # "top_p": args.top_p,
+    }
+
+    try:
+        response = client.chat.completions.create(**completion_args)
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+
+    if response is not None:
+        return response.choices[0].message.content
+
+    else:
+        return None
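A quick call sketch for the new module (not part of the diff). It assumes MODAL_API_KEY is set in the environment and the Modal endpoint above is reachable; on any API error summarize_content() logs the exception and returns None rather than raising:

import functions.summarization as summarization_funcs

# Placeholder input; in the app this is the text parse_feed() extracted
summary = summarization_funcs.summarize_content(
    'Full article text extracted from a feed entry goes here...'
)

if summary is None:
    print('Summarization endpoint unavailable, keeping raw content')
else:
    print(summary)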
functions/tools.py
CHANGED
@@ -2,30 +2,42 @@
 
 import json
 import logging
-import functions.
+import functions.feed_extraction as extraction_funcs
+import functions.summarization as summarization_funcs
 
 
-def
-    '''Gets RSS feed content from a given website.
+def get_feed(website: str) -> list:
+    '''Gets RSS feed content from a given website. Can take a website or RSS
+    feed URL directly, or the name of a website. Will attempt to find RSS
+    feed and return title, summary and link to full article for most recent
+    items in feed
 
     Args:
-
+        website: URL or name of website to extract RSS feed content from
 
     Returns:
-
-        requested website
+        JSON string containing the feed content or 'No feed found' if a RSS
+        feed for the requested website could not be found
     '''
 
     logger = logging.getLogger(__name__ + '.get_content')
     logger.info('Getting feed content for: %s', website)
 
-    feed_uri =
+    feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
         return 'No feed found'
 
-    content =
+    content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    for i, item in content.items():
+
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(item['content'])
+            content[i]['summary'] = summary
+
+        content[i].pop('content', None)
+
     return json.dumps(content)
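Sketch of consuming get_feed()'s return value (not part of the diff; the tool_funcs alias mirrors the one rss_server.py uses). The function returns the string 'No feed found' on failure, otherwise a JSON string keyed by entry index; after the summarization loop each entry carries title, link and summary, with the raw content field popped off:

import json

import functions.tools as tool_funcs

result = tool_funcs.get_feed('hackernews.com')

if result == 'No feed found':
    print(result)
else:
    # Entries are keyed by their index in the feed
    for index, entry in json.loads(result).items():
        print(index, entry['title'])
        print('  ', entry['link'])
        print('  ', entry.get('summary'))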
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ feedparser
 findfeed
 googlesearch-python
 gradio
-mcp
+mcp
+openai
rss_server.py
CHANGED
@@ -39,7 +39,7 @@ with gr.Blocks() as demo:
     submit_button = gr.Button('Submit')
 
     submit_button.click( # pylint: disable=no-member
-        fn=tool_funcs.
+        fn=tool_funcs.get_feed,
         inputs=website_url,
         outputs=output,
         api_name='Get RSS feed content'
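For context, a minimal reconstruction of the Gradio wiring around this change (a sketch, not the actual rss_server.py; only website_url, output, submit_button and the click() call are visible in the diff, the component definitions and launch are assumed):

import gradio as gr

import functions.tools as tool_funcs

with gr.Blocks() as demo:
    website_url = gr.Textbox(label='Website name, URL or feed URI')
    output = gr.Textbox(label='Extracted feed content')
    submit_button = gr.Button('Submit')

    # The click handler now points at tools.get_feed()
    submit_button.click( # pylint: disable=no-member
        fn=tool_funcs.get_feed,
        inputs=website_url,
        outputs=output,
        api_name='Get RSS feed content'
    )

if __name__ == '__main__':
    demo.launch()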