gperdrizet committed · Commit e97f932 · unverified · 1 Parent(s): 3937afc

Added page content extraction and HTML cleaning functions.

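As a rough sketch of how the pieces added in this commit are meant to fit together (the call chain is inferred from the diff below; the site name is a made-up example, not part of the commit):

import json

import functions.helper_functions as helper_funcs

# Resolve a feed URI from a site name, URL or feed URI, then pull and clean its entries.
feed_uri = helper_funcs.find_feed_uri('hackernews.com')   # illustrative input only
entries = helper_funcs.parse_feed(feed_uri)                # dict keyed by entry index

# Each entry should carry title, link and the boilerpy3/regex-cleaned page text
# under 'extracted_content'.
print(json.dumps(entries.get(0, {}), indent=2))
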
functions/helper_functions.py CHANGED
@@ -1,12 +1,71 @@
'''Helper functions for MCP tools.'''

import logging
- from types import GeneratorType

import feedparser
from findfeed import search as feed_search
from googlesearch import search as google_search
 
def get_url(company_name: str) -> str:
    '''Finds the website associated with the name of a company or
    publication.
@@ -66,18 +125,146 @@ def parse_feed(feed_uri: str) -> list:
    logger = logging.getLogger(__name__ + '.parse_feed')

    feed = feedparser.parse(feed_uri)
-     logger.info('%s yieled %s entries', feed_uri, len(feed.entries))

-     titles = []

-     for entry in feed.entries:

-         logger.debug('Entry attributes: %s', list(entry.keys()))

-         if 'title' in entry:
-             titles.append(entry.title)

-         if len(titles) >= 10:
            break

-     return titles
 
'''Helper functions for MCP tools.'''

+ import re
import logging
+ import urllib.request
+ from urllib.error import HTTPError, URLError

import feedparser
+ from boilerpy3 import extractors
+ from boilerpy3.exceptions import HTMLExtractionError
from findfeed import search as feed_search
from googlesearch import search as google_search

+ FEED_URIS = {}
+ RSS_EXTENSIONS = ['xml', 'rss', 'atom']
+ COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
+
+
+ def find_feed_uri(website: str) -> str:
+     '''Attempts to find URI for RSS feed. First checks if the string provided in
+     website is a feed URI; if it's not, checks if website is a URL and, if so,
+     uses that to find the RSS feed URI. If the provided string is neither,
+     defaults to Google search to find the website URL and then uses that to try
+     and find the feed.
+
+     Args:
+         website: target resource to find RSS feed URI for, can be website URL or
+             name of website
+
+     Returns:
+         RSS feed URI for website
+     '''
+
+     logger = logging.getLogger(__name__ + '.find_feed_uri')
+     logger.info('Finding feed URI for %s', website)
+
+     # Find the feed URI
+     feed_uri = None
+
+     # If the website contains xml, rss or atom, assume it's an RSS URI
+     if any(extension in website.lower() for extension in RSS_EXTENSIONS):
+         feed_uri = website
+         logger.info('%s looks like a feed URI already - using it directly', website)
+
+     # Next, check the cache to see if we already have this feed's URI
+     elif website in FEED_URIS:
+         feed_uri = FEED_URIS[website]
+         logger.info('%s feed URI in cache: %s', website, feed_uri)
+
+     # If neither of those gets it - try get_feed() if it looks like a url
+     # or else just google it
+     else:
+         if website.split('.')[-1] in COMMON_EXTENSIONS:
+             website_url = website
+             logger.info('%s looks like a website URL', website)
+
+         else:
+             website_url = get_url(website)
+             logger.info('Google result for %s: %s', website, website_url)
+
+         feed_uri = get_feed(website_url)
+         logger.info('get_feed() returned %s', feed_uri)
+
+     FEED_URIS[website] = feed_uri
+
+     return feed_uri
+
+
def get_url(company_name: str) -> str:
    '''Finds the website associated with the name of a company or
    publication.
 
    logger = logging.getLogger(__name__ + '.parse_feed')

    feed = feedparser.parse(feed_uri)
+     logger.info('%s yielded %s entries', feed_uri, len(feed.entries))
+
+     entries = {}
+
+     for i, entry in enumerate(feed.entries):
+
+         entry_content = {}
+
+         if 'title' in entry and 'link' in entry:
+
+             entry_content['title'] = entry.title
+             entry_content['link'] = entry.link
+
+             entry_content['updated'] = None
+             entry_content['summary'] = None
+             entry_content['content'] = None
+
+             if 'updated' in entry:
+                 entry_content['updated'] = entry.updated
+
+             if 'summary' in entry:
+                 summary = get_text(entry.summary)
+                 entry_content['summary'] = summary

+             if 'content' in entry:
+                 entry_content['content'] = entry.content

+             html = get_html(entry_content['link'])
+             content = get_text(html)

+             entry_content['extracted_content'] = content

+             entries[i] = entry_content

+         if i == 9:
            break

+     logger.info('Entries contains %s elements', len(list(entries.keys())))
+
+     return entries
+
+
+ def get_html(url: str) -> str:
+     '''Gets HTML string content from url
+
+     Args:
+         url: the webpage to extract content from
+
+     Returns:
+         Webpage HTML source as string
+     '''
+
+     header = {
+         "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,image/avif," +
+             "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
+         "Accept-Language": "en-US,en;q=0.9",
+         "Connection": "keep-alive",
+         "Sec-Fetch-Site": "cross-site",
+         "Sec-Fetch-User": "?1",
+         "Upgrade-Insecure-Requests": "1",
+         "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " +
+             "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
+     }
+
+     # Create the request with header
+     request_params = urllib.request.Request(
+         url=url,
+         headers=header
+     )
+
+     # Get the html string
+     try:
+         with urllib.request.urlopen(request_params) as response:
+             status_code = response.getcode()
+
+             if status_code == 200:
+                 content = response.read()
+                 encoding = response.headers.get_content_charset()
+
+                 if encoding is None:
+                     encoding = "utf-8"
+
+                 content = content.decode(encoding)
+
+     except HTTPError:
+         content = None
+
+     except URLError:
+         content = None
+
+     return content
+
+
+ def get_text(html: str) -> str:
+     '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
+     function to try and extract text from HTML as cleanly as possible.
+
+     Args:
+         html: the HTML string to be cleaned
+
+     Returns:
+         Cleaned text string'''
+
+     extractor = extractors.ArticleExtractor()
+
+     try:
+         html = extractor.get_content(html)
+
+     except HTMLExtractionError:
+         pass
+
+
+     return clean_html(html)
+
+
+ def clean_html(html: str) -> str:
+     '''
+     Remove HTML markup from the given string.
+
+     Args:
+         html: the HTML string to be cleaned
+
+     Returns:
+         Cleaned string
+     '''
+
+     # First we remove inline JavaScript/CSS:
+     cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
+
+     # Then we remove html comments. This has to be done before removing regular
+     # tags since comments can contain '>' characters.
+     cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
+
+     # Next we can remove the remaining tags:
+     cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
+
+
+     # Finally, we deal with whitespace
+     cleaned = re.sub(r"&nbsp;", " ", cleaned)
+     cleaned = re.sub(r"  ", " ", cleaned)
+     cleaned = re.sub(r"  ", " ", cleaned)
+
+     return cleaned.strip()
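
For reference, a quick, hedged sketch of what the regex-based clean_html() pass above does to a small, made-up HTML snippet (output shown approximately):

from functions.helper_functions import clean_html

sample = (
    '<html><head><style>p {color: red}</style></head>'
    '<body><!-- nav --><p>First paragraph.</p>'
    '<script>var x = 1;</script><p>Second&nbsp;paragraph.</p></body></html>'
)

# Script/style blocks and comments are stripped first, remaining tags become
# spaces, then &nbsp; entities and runs of double spaces are collapsed.
print(clean_html(sample))   # roughly: 'First paragraph. Second paragraph.'
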
functions/tools.py CHANGED
@@ -1,13 +1,9 @@
'''Tool functions for MCP server'''

import logging
- from urllib.parse import urlparse
import functions.helper_functions as helper_funcs

- FEED_URIS = {}
- RSS_EXTENSIONS = ['xml', 'rss', 'atom']
- COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
-

def get_content(website: str) -> list:
    '''Gets RSS feed content from a given website.
@@ -23,36 +19,13 @@ def get_content(website: str) -> list:
    logger = logging.getLogger(__name__ + '.get_content')
    logger.info('Getting feed content for: %s', website)

-     # Find the feed URI
-     feed_uri = None
-
-     # If the website contains xml, rss or atom, assume it's an RSS URI
-     if any(extension in website.lower() for extension in RSS_EXTENSIONS):
-         feed_uri = website
-         logger.info('%s looks like a feed URI already - using it directly', website)
-
-     # Next, check the cache to see if we alreay have this feed's URI
-     elif website in FEED_URIS.keys():
-         feed_uri = FEED_URIS[website]
-         logger.info('%s feed URI in cache: %s', website, feed_uri)
-
-     # If neither of those get it - try feedparse if it looks like a url
-     # or else just google it
-     else:
-         if website.split('.')[-1] in COMMON_EXTENSIONS:
-             website_url = website
-             logger.info('%s looks like a website URL', website)
-
-         else:
-             website_url = helper_funcs.get_url(website)
-             logger.info('Google result for %s: %s', website, website_url)
-
-         feed_uri = helper_funcs.get_feed(website_url)
-         logger.info('get_feed() returned %s', feed_uri)

-     FEED_URIS[website] = feed_uri

    content = helper_funcs.parse_feed(feed_uri)
-     logger.info('parse_feed() returned %s', content)

-     return '\n'.join(content)
 
'''Tool functions for MCP server'''

+ import json
import logging
import functions.helper_functions as helper_funcs


def get_content(website: str) -> list:
    '''Gets RSS feed content from a given website.

    logger = logging.getLogger(__name__ + '.get_content')
    logger.info('Getting feed content for: %s', website)

+     feed_uri = helper_funcs.find_feed_uri(website)
+     logger.info('find_feed_uri() returned %s', feed_uri)

+     if 'No feed found' in feed_uri:
+         return 'No feed found'

    content = helper_funcs.parse_feed(feed_uri)
+     logger.info('parse_feed() returned %s entries', len(list(content.keys())))

+     return json.dumps(content)
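
A short, hypothetical usage sketch for the updated get_content() tool (the site name is an arbitrary example; the JSON shape follows parse_feed() in helper_functions.py):

import json

from functions.tools import get_content

result = get_content('slashdot.org')   # any site name, URL or feed URI

if result != 'No feed found':
    for index, entry in json.loads(result).items():
        print(index, entry['title'], entry['link'])
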
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+ boilerpy3
feedparser
findfeed
googlesearch-python