Commit 8c04b7a (unverified) by gperdrizet
Parents: aa0f12c 4d43357

Merge pull request #10 from gperdrizet/dev

assets/html.py CHANGED
@@ -3,24 +3,25 @@
 TITLE = (
     '''
     <center>
-        <h1>RSS feed finder/extractor</h1>
+        <h1>RSS feed reader</h1>
     </center>
     '''
 )
 
 DESCRIPTION = (
     '''
-    Functions to find and extract RSS feeds are complete-ish. No AI
-    yet, plan for tomorrow is to build two tools:
+    Functions to find, extract and summarize RSS feeds are complete.
+
+    <h2>Tools</h2>
 
     <ol>
-        <li>Human readable summaries of requested RSS feed</li>
-        <li>Simple RAG on requested RSS feed content</li>
+        <li><b>DONE</b> Given a website name or URL, find its RSS feed and return recent
+        article titles, links and a generated summary of content if available</li>
+        <li><b>TODO</b> Simple RAG on requested RSS feed content</li>
     </ol>
 
-    For now we just dump the extracted RSS content below. Try asking
-    for a feed by website name, website URL, or entering your favorite
-    feed URI directly. Suggestions: http://openai.com/news/rss.xml,
-    hackernews.com, Hugging Face, etc
+    For now we dump the extracted RSS title, link and summary below. Try asking for a
+    feed by website name, website URL, or entering your favorite feed URI directly.
+    Suggestions: http://openai.com/news/rss.xml, hackernews.com, Hugging Face, etc
     '''
 )
functions/{helper_functions.py → feed_extraction.py} RENAMED
@@ -55,10 +55,10 @@ def find_feed_uri(website: str) -> str:
         logger.info('%s looks like a website URL', website)
 
     else:
-        website_url = get_url(website)
+        website_url = _get_url(website)
         logger.info('Google result for %s: %s', website, website_url)
 
-    feed_uri = get_feed(website_url)
+    feed_uri = _get_feed(website_url)
     logger.info('get_feed() returned %s', feed_uri)
 
     FEED_URIS[website] = feed_uri
@@ -66,52 +66,6 @@ def find_feed_uri(website: str) -> str:
     return feed_uri
 
 
-def get_url(company_name: str) -> str:
-    '''Finds the website associated with the name of a company or
-    publication.
-
-    Args:
-        company_name: the name of the company, publication or site to find
-        the URL for
-
-    Returns:
-        The URL for the company, publication or website.
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_url')
-    logger.info('Getting website URL for %s', company_name)
-
-    query = f'{company_name} official website'
-
-    for url in google_search(query, num_results=5):
-        if 'facebook' not in url and 'linkedin' not in url:
-            return url
-
-    return None
-
-
-def get_feed(website_url: str) -> str:
-    '''Finds the RSS feed URI for a website given the website's url.
-
-    Args:
-        website_url: The url for the website to find the RSS feed for
-
-    Returns:
-        The website's RSS feed URI as a string
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_content')
-    logger.info('Getting feed URI for: %s', website_url)
-
-    feeds = feed_search(website_url)
-
-    if len(feeds) > 0:
-        return str(feeds[0].url)
-
-    else:
-        return f'No feed found for {website_url}'
-
-
 def parse_feed(feed_uri: str) -> list:
     '''Gets content from a remote RSS feed URI.
 
@@ -138,28 +92,29 @@ def parse_feed(feed_uri: str) -> list:
         entry_content['title'] = entry.title
         entry_content['link'] = entry.link
 
-        entry_content['updated'] = None
-        entry_content['summary'] = None
+        # entry_content['updated'] = None
+        # entry_content['summary'] = None
         entry_content['content'] = None
 
-        if 'updated' in entry:
-            entry_content['updated'] = entry.updated
+        # if 'updated' in entry:
+        #     entry_content['updated'] = entry.updated
 
-        if 'summary' in entry:
-            summary = get_text(entry.summary)
-            entry_content['summary'] = summary
+        # if 'summary' in entry:
+        #     summary = _get_text(entry.summary)
+        #     entry_content['summary'] = summary
 
         if 'content' in entry:
             entry_content['content'] = entry.content
 
-        html = get_html(entry_content['link'])
-        content = get_text(html)
-
-        entry_content['extracted_content'] = content
+        if entry_content['content'] is None:
+
+            html = _get_html(entry_content['link'])
+            content = _get_text(html)
+            entry_content['content'] = content
 
         entries[i] = entry_content
 
-        if i == 9:
+        if i == 2:
             break
 
     logger.info('Entries contains %s elements', len(list(entries.keys())))
@@ -167,7 +122,53 @@ def parse_feed(feed_uri: str) -> list:
     return entries
 
 
-def get_html(url: str) -> str:
+def _get_url(company_name: str) -> str:
+    '''Finds the website associated with the name of a company or
+    publication.
+
+    Args:
+        company_name: the name of the company, publication or site to find
+        the URL for
+
+    Returns:
+        The URL for the company, publication or website.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_url')
+    logger.info('Getting website URL for %s', company_name)
+
+    query = f'{company_name} official website'
+
+    for url in google_search(query, num_results=5):
+        if 'facebook' not in url and 'linkedin' not in url:
+            return url
+
+    return None
+
+
+def _get_feed(website_url: str) -> str:
+    '''Finds the RSS feed URI for a website given the website's url.
+
+    Args:
+        website_url: The url for the website to find the RSS feed for
+
+    Returns:
+        The website's RSS feed URI as a string
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_content')
+    logger.info('Getting feed URI for: %s', website_url)
+
+    feeds = feed_search(website_url)
+
+    if len(feeds) > 0:
+        return str(feeds[0].url)
+
+    else:
+        return f'No feed found for {website_url}'
+
+
+def _get_html(url: str) -> str:
     '''Gets HTML string content from url
 
     Args:
@@ -221,7 +222,7 @@ def get_html(url: str) -> str:
     return content
 
 
-def get_text(html: str) -> str:
+def _get_text(html: str) -> str:
     '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
     function to try and extract text from HTML as cleanly as possible.
 
@@ -248,10 +249,10 @@ def get_text(html: str) -> str:
     except TypeError:
         pass
 
-    return clean_html(html)
+    return _clean_html(html)
 
 
-def clean_html(html: str) -> str:
+def _clean_html(html: str) -> str:
     '''
     Remove HTML markup from the given string.
 
@@ -262,6 +263,9 @@ def clean_html(html: str) -> str:
         Cleaned string
     '''
 
+    if html is None:
+        return None
+
     # First we remove inline JavaScript/CSS:
     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
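
As a quick sanity check, the renamed module's public surface can be exercised like this; a minimal sketch, assuming the repo's dependencies (feedparser, findfeed, googlesearch-python) are installed and the network is reachable. Only find_feed_uri() and parse_feed() stay public after this commit; the rest become private helpers:

import functions.feed_extraction as extraction_funcs

# Resolve a site name or URL to its RSS feed URI (Google search + findfeed)
feed_uri = extraction_funcs.find_feed_uri('Hugging Face')

# Despite the `-> list` annotation, parse_feed() returns a dict keyed by
# entry index; after this commit it stops early at three entries (i == 2)
entries = extraction_funcs.parse_feed(feed_uri)

for i, entry in entries.items():
    print(i, entry['title'], entry['link'])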
 
functions/summarization.py ADDED
@@ -0,0 +1,74 @@
+'''Functions to summarize article content.'''
+
+import os
+import logging
+
+from openai import OpenAI
+
+
+def summarize_content(content: str) -> str:
+    '''Generates summary of article content using Modal inference endpoint.
+
+    Args:
+        content: string containing the text content to be summarized
+
+    Returns:
+        Summarized text as string
+    '''
+
+    logger = logging.getLogger(__name__ + '.summarize_content')
+    logger.info('Summarizing extracted content')
+
+    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
+
+    client.base_url = (
+        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
+    )
+
+    # Default to first available model
+    model = client.models.list().data[0]
+    model_id = model.id
+
+    # messages = [
+    #     {
+    #         'role': 'system',
+    #         'content': ('You are a research assistant, skilled in summarizing documents in just '+
+    #             'a few sentences. Your document summaries should be a maximum of 2 to 4 sentences long.'),
+    #         'role': 'user',
+    #         'content': content
+    #     }
+    # ]
+
+    messages = [
+        {
+            'role': 'system',
+            'content': f'Summarize the following text in 50 words returning only the summary: {content}'
+        }
+    ]
+
+    completion_args = {
+        'model': model_id,
+        'messages': messages,
+        # "frequency_penalty": args.frequency_penalty,
+        # "max_tokens": 128,
+        # "n": args.n,
+        # "presence_penalty": args.presence_penalty,
+        # "seed": args.seed,
+        # "stop": args.stop,
+        # "stream": args.stream,
+        # "temperature": args.temperature,
+        # "top_p": args.top_p,
+    }
+
+    try:
+        response = client.chat.completions.create(**completion_args)
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        response = None
+        logger.error('Error during Modal API call: %s', e)
+
+    if response is not None:
+        return response.choices[0].message.content
+
+    else:
+        return None
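
The new endpoint wrapper can be smoke-tested on its own; a minimal sketch, assuming MODAL_API_KEY is set in the environment and the Modal vLLM deployment above is live (the article text here is a hypothetical placeholder):

from functions.summarization import summarize_content

article_text = 'Some long extracted article content...'  # placeholder input

summary = summarize_content(article_text)

# summarize_content() swallows API errors and returns None, so callers
# must handle the failure case themselves
if summary is None:
    print('Summarization failed; see the log for the Modal API error')
else:
    print(summary)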
functions/tools.py CHANGED
@@ -2,30 +2,42 @@
 
 import json
 import logging
-import functions.helper_functions as helper_funcs
+import functions.feed_extraction as extraction_funcs
+import functions.summarization as summarization_funcs
 
 
-def get_content(website: str) -> list:
-    '''Gets RSS feed content from a given website.
+def get_feed(website: str) -> list:
+    '''Gets RSS feed content from a given website. Can take a website or RSS
+    feed URL directly, or the name of a website. Will attempt to find RSS
+    feed and return title, summary and link to full article for most recent
+    items in feed
 
     Args:
-        website_url: URL or nam of website to extract RSS feed content from
+        website: URL or name of website to extract RSS feed content from
 
     Returns:
-        List of titles for the 10 most recent entries in the RSS feed from the
-        requested website.
+        JSON string containing the feed content or 'No feed found' if a RSS
+        feed for the requested website could not be found
     '''
 
     logger = logging.getLogger(__name__ + '.get_content')
     logger.info('Getting feed content for: %s', website)
 
-    feed_uri = helper_funcs.find_feed_uri(website)
+    feed_uri = extraction_funcs.find_feed_uri(website)
     logger.info('find_feed_uri() returned %s', feed_uri)
 
     if 'No feed found' in feed_uri:
         return 'No feed found'
 
-    content = helper_funcs.parse_feed(feed_uri)
+    content = extraction_funcs.parse_feed(feed_uri)
     logger.info('parse_feed() returned %s entries', len(list(content.keys())))
 
+    for i, item in content.items():
+
+        if item['content'] is not None:
+            summary = summarization_funcs.summarize_content(item['content'])
+            content[i]['summary'] = summary
+
+        content[i].pop('content', None)
+
     return json.dumps(content)
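
After this change the tool returns a JSON string rather than a list of titles; a sketch of what decoding it might look like, assuming a feed was found (field values are illustrative only):

import json
from functions.tools import get_feed

result = json.loads(get_feed('hackernews.com'))

# json.dumps() turns the integer entry indices into string keys; each entry
# keeps 'title' and 'link', drops 'content', and gains 'summary' only when
# summarize_content() had content to work with
for key, entry in result.items():
    print(key, entry['title'], entry.get('summary'))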
requirements.txt CHANGED
@@ -3,4 +3,5 @@ feedparser
 findfeed
 googlesearch-python
 gradio
-mcp
+mcp
+openai
rss_server.py CHANGED
@@ -39,7 +39,7 @@ with gr.Blocks() as demo:
     submit_button = gr.Button('Submit')
 
     submit_button.click( # pylint: disable=no-member
-        fn=tool_funcs.get_content,
+        fn=tool_funcs.get_feed,
        inputs=website_url,
        outputs=output,
        api_name='Get RSS feed content'
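
Because the click handler is registered with an api_name, the renamed tool can also be driven remotely; a minimal sketch using gradio_client, assuming the server is running on Gradio's default local port and that the endpoint slug is derived from the api_name above (Gradio may normalize it differently):

from gradio_client import Client

client = Client('http://127.0.0.1:7860')

# The positional argument feeds the website_url textbox wired to the handler
result = client.predict(
    'Hugging Face',
    api_name='/Get_RSS_feed_content'  # assumed normalization of the api_name
)

print(result)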