gperdrizet committed
Commit e51eea0 · unverified · 1 Parent(s): 687d26a

Reorganized functions.

Files changed (1):
    functions/feed_extraction.py  +55 -55
functions/feed_extraction.py CHANGED
@@ -55,10 +55,10 @@ def find_feed_uri(website: str) -> str:
         logger.info('%s looks like a website URL', website)
 
     else:
-        website_url = get_url(website)
+        website_url = _get_url(website)
         logger.info('Google result for %s: %s', website, website_url)
 
-        feed_uri = get_feed(website_url)
+        feed_uri = _get_feed(website_url)
         logger.info('get_feed() returned %s', feed_uri)
 
     FEED_URIS[website] = feed_uri
@@ -66,52 +66,6 @@ def find_feed_uri(website: str) -> str:
     return feed_uri
 
 
-def get_url(company_name: str) -> str:
-    '''Finds the website associated with the name of a company or
-    publication.
-
-    Args:
-        company_name: the name of the company, publication or site to find
-            the URL for
-
-    Returns:
-        The URL for the company, publication or website.
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_url')
-    logger.info('Getting website URL for %s', company_name)
-
-    query = f'{company_name} official website'
-
-    for url in google_search(query, num_results=5):
-        if 'facebook' not in url and 'linkedin' not in url:
-            return url
-
-    return None
-
-
-def get_feed(website_url: str) -> str:
-    '''Finds the RSS feed URI for a website given the website's url.
-
-    Args:
-        website_url: The url for the website to find the RSS feed for
-
-    Returns:
-        The website's RSS feed URI as a string
-    '''
-
-    logger = logging.getLogger(__name__ + '.get_content')
-    logger.info('Getting feed URI for: %s', website_url)
-
-    feeds = feed_search(website_url)
-
-    if len(feeds) > 0:
-        return str(feeds[0].url)
-
-    else:
-        return f'No feed found for {website_url}'
-
-
 def parse_feed(feed_uri: str) -> list:
     '''Gets content from a remote RSS feed URI.
 
@@ -146,14 +100,14 @@ def parse_feed(feed_uri: str) -> list:
         entry_content['updated'] = entry.updated
 
         if 'summary' in entry:
-            summary = get_text(entry.summary)
+            summary = _get_text(entry.summary)
             entry_content['summary'] = summary
 
         if 'content' in entry:
             entry_content['content'] = entry.content
 
-        html = get_html(entry_content['link'])
-        content = get_text(html)
+        html = _get_html(entry_content['link'])
+        content = _get_text(html)
 
         entry_content['extracted_content'] = content
 
@@ -167,7 +121,53 @@ def parse_feed(feed_uri: str) -> list:
     return entries
 
 
-def get_html(url: str) -> str:
+def _get_url(company_name: str) -> str:
+    '''Finds the website associated with the name of a company or
+    publication.
+
+    Args:
+        company_name: the name of the company, publication or site to find
+            the URL for
+
+    Returns:
+        The URL for the company, publication or website.
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_url')
+    logger.info('Getting website URL for %s', company_name)
+
+    query = f'{company_name} official website'
+
+    for url in google_search(query, num_results=5):
+        if 'facebook' not in url and 'linkedin' not in url:
+            return url
+
+    return None
+
+
+def _get_feed(website_url: str) -> str:
+    '''Finds the RSS feed URI for a website given the website's url.
+
+    Args:
+        website_url: The url for the website to find the RSS feed for
+
+    Returns:
+        The website's RSS feed URI as a string
+    '''
+
+    logger = logging.getLogger(__name__ + '.get_content')
+    logger.info('Getting feed URI for: %s', website_url)
+
+    feeds = feed_search(website_url)
+
+    if len(feeds) > 0:
+        return str(feeds[0].url)
+
+    else:
+        return f'No feed found for {website_url}'
+
+
+def _get_html(url: str) -> str:
     '''Gets HTML string content from url
 
     Args:
@@ -221,7 +221,7 @@ def get_html(url: str) -> str:
     return content
 
 
-def get_text(html: str) -> str:
+def _get_text(html: str) -> str:
    '''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
    function to try and extract text from HTML as cleanly as possible.
 
@@ -248,10 +248,10 @@ def get_text(html: str) -> str:
     except TypeError:
         pass
 
-    return clean_html(html)
+    return _clean_html(html)
 
 
-def clean_html(html: str) -> str:
+def _clean_html(html: str) -> str:
     '''
     Remove HTML markup from the given string.
 
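Net effect of the reorganization: get_url, get_feed, get_html, get_text, and clean_html become underscore-prefixed private helpers, and _get_url/_get_feed move below parse_feed, leaving find_feed_uri() and parse_feed() as the module's public entry points. A minimal usage sketch of that public surface, assuming the repo root is on sys.path so the module imports as functions.feed_extraction, that parse_feed() returns a list of dicts as the diff suggests, and that the site name below is purely illustrative:

# Sketch only: exercises the two functions left public by this commit.
from functions import feed_extraction

# Resolve a company/site name (or URL) to its RSS feed URI; per the
# diff, results are cached in the module-level FEED_URIS dict.
# 'Hacker News' is a hypothetical example input, not from the source.
feed_uri = feed_extraction.find_feed_uri('Hacker News')

# Fetch and clean the feed entries; the diff shows each entry dict
# gaining 'summary', 'content', and 'extracted_content' keys when the
# underlying feed provides them.
entries = feed_extraction.parse_feed(feed_uri)

for entry in entries:
    print(entry.get('extracted_content', '')[:200])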