Spaces:
Runtime error
Runtime error
Reorganized functions.
Browse files- functions/feed_extraction.py +55 -55
functions/feed_extraction.py
CHANGED
@@ -55,10 +55,10 @@ def find_feed_uri(website: str) -> str:
|
|
55 |
logger.info('%s looks like a website URL', website)
|
56 |
|
57 |
else:
|
58 |
-
website_url =
|
59 |
logger.info('Google result for %s: %s', website, website_url)
|
60 |
|
61 |
-
feed_uri =
|
62 |
logger.info('get_feed() returned %s', feed_uri)
|
63 |
|
64 |
FEED_URIS[website] = feed_uri
|
@@ -66,52 +66,6 @@ def find_feed_uri(website: str) -> str:
|
|
66 |
return feed_uri
|
67 |
|
68 |
|
69 |
-
def get_url(company_name: str) -> str:
    '''Finds the website associated with the name of a company or
    publication.

    Passes the query "<company_name> official website" to google_search
    and returns the first of the (up to five) results whose URL contains
    neither "facebook" nor "linkedin".

    Args:
        company_name: the name of the company, publication or site to find
            the URL for

    Returns:
        The URL for the company, publication or website, or None when
        company_name is empty/blank or no acceptable result was returned.
    '''

    logger = logging.getLogger(__name__ + '.get_url')

    # A blank name would turn the query into just " official website" and
    # yield an unrelated hit, so stop before calling google_search.
    if not company_name or not company_name.strip():
        logger.warning('No company name given, skipping search')
        return None

    logger.info('Getting website URL for %s', company_name)

    query = f'{company_name} official website'

    for url in google_search(query, num_results=5):
        # Compare case-insensitively so e.g. "Facebook.com" is skipped too.
        lowered = url.lower()

        if 'facebook' not in lowered and 'linkedin' not in lowered:
            return url

    # Callers receive None here; log it so the miss is traceable.
    logger.warning('No usable search result for %s', company_name)

    return None
|
91 |
-
|
92 |
-
|
93 |
-
def get_feed(website_url: str) -> str:
    '''Finds the RSS feed URI for a website given the website's url.

    Args:
        website_url: The url for the website to find the RSS feed for

    Returns:
        The website's RSS feed URI as a string (the url of the first
        result from feed_search), or the message
        'No feed found for <website_url>' when there is no URL to search
        or feed_search returns nothing.
    '''

    # Logger is named after this function (previously '.get_content',
    # which did not match any function using it here).
    logger = logging.getLogger(__name__ + '.get_feed')

    not_found = f'No feed found for {website_url}'

    # get_url() returns None when it finds no website; don't hand that
    # (or an empty string) to feed_search.
    if not website_url:
        logger.warning('No website URL given, skipping feed search')
        return not_found

    logger.info('Getting feed URI for: %s', website_url)

    feeds = feed_search(website_url)

    if feeds:
        return str(feeds[0].url)

    return not_found
|
113 |
-
|
114 |
-
|
115 |
def parse_feed(feed_uri: str) -> list:
|
116 |
'''Gets content from a remote RSS feed URI.
|
117 |
|
@@ -146,14 +100,14 @@ def parse_feed(feed_uri: str) -> list:
|
|
146 |
entry_content['updated'] = entry.updated
|
147 |
|
148 |
if 'summary' in entry:
|
149 |
-
summary =
|
150 |
entry_content['summary'] = summary
|
151 |
|
152 |
if 'content' in entry:
|
153 |
entry_content['content'] = entry.content
|
154 |
|
155 |
-
html =
|
156 |
-
content =
|
157 |
|
158 |
entry_content['extracted_content'] = content
|
159 |
|
@@ -167,7 +121,53 @@ def parse_feed(feed_uri: str) -> list:
|
|
167 |
return entries
|
168 |
|
169 |
|
170 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
'''Gets HTML string content from url
|
172 |
|
173 |
Args:
|
@@ -221,7 +221,7 @@ def get_html(url: str) -> str:
|
|
221 |
return content
|
222 |
|
223 |
|
224 |
-
def
|
225 |
'''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
|
226 |
function to try and extract text from HTML as cleanly as possible.
|
227 |
|
@@ -248,10 +248,10 @@ def get_text(html: str) -> str:
|
|
248 |
except TypeError:
|
249 |
pass
|
250 |
|
251 |
-
return
|
252 |
|
253 |
|
254 |
-
def
|
255 |
'''
|
256 |
Remove HTML markup from the given string.
|
257 |
|
|
|
55 |
logger.info('%s looks like a website URL', website)
|
56 |
|
57 |
else:
|
58 |
+
website_url = _get_url(website)
|
59 |
logger.info('Google result for %s: %s', website, website_url)
|
60 |
|
61 |
+
feed_uri = _get_feed(website_url)
|
62 |
logger.info('get_feed() returned %s', feed_uri)
|
63 |
|
64 |
FEED_URIS[website] = feed_uri
|
|
|
66 |
return feed_uri
|
67 |
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
def parse_feed(feed_uri: str) -> list:
|
70 |
'''Gets content from a remote RSS feed URI.
|
71 |
|
|
|
100 |
entry_content['updated'] = entry.updated
|
101 |
|
102 |
if 'summary' in entry:
|
103 |
+
summary = _get_text(entry.summary)
|
104 |
entry_content['summary'] = summary
|
105 |
|
106 |
if 'content' in entry:
|
107 |
entry_content['content'] = entry.content
|
108 |
|
109 |
+
html = _get_html(entry_content['link'])
|
110 |
+
content = _get_text(html)
|
111 |
|
112 |
entry_content['extracted_content'] = content
|
113 |
|
|
|
121 |
return entries
|
122 |
|
123 |
|
124 |
+
def _get_url(company_name: str) -> str:
|
125 |
+
'''Finds the website associated with the name of a company or
|
126 |
+
publication.
|
127 |
+
|
128 |
+
Args:
|
129 |
+
company_name: the name of the company, publication or site to find
|
130 |
+
the URL for
|
131 |
+
|
132 |
+
Returns:
|
133 |
+
The URL for the company, publication or website.
|
134 |
+
'''
|
135 |
+
|
136 |
+
logger = logging.getLogger(__name__ + '.get_url')
|
137 |
+
logger.info('Getting website URL for %s', company_name)
|
138 |
+
|
139 |
+
query = f'{company_name} official website'
|
140 |
+
|
141 |
+
for url in google_search(query, num_results=5):
|
142 |
+
if 'facebook' not in url and 'linkedin' not in url:
|
143 |
+
return url
|
144 |
+
|
145 |
+
return None
|
146 |
+
|
147 |
+
|
148 |
+
def _get_feed(website_url: str) -> str:
|
149 |
+
'''Finds the RSS feed URI for a website given the website's url.
|
150 |
+
|
151 |
+
Args:
|
152 |
+
website_url: The url for the website to find the RSS feed for
|
153 |
+
|
154 |
+
Returns:
|
155 |
+
The website's RSS feed URI as a string
|
156 |
+
'''
|
157 |
+
|
158 |
+
logger = logging.getLogger(__name__ + '.get_content')
|
159 |
+
logger.info('Getting feed URI for: %s', website_url)
|
160 |
+
|
161 |
+
feeds = feed_search(website_url)
|
162 |
+
|
163 |
+
if len(feeds) > 0:
|
164 |
+
return str(feeds[0].url)
|
165 |
+
|
166 |
+
else:
|
167 |
+
return f'No feed found for {website_url}'
|
168 |
+
|
169 |
+
|
170 |
+
def _get_html(url: str) -> str:
|
171 |
'''Gets HTML string content from url
|
172 |
|
173 |
Args:
|
|
|
221 |
return content
|
222 |
|
223 |
|
224 |
+
def _get_text(html: str) -> str:
|
225 |
'''Uses boilerpy3 extractor and regex cribbed from old NLTK clean_html
|
226 |
function to try and extract text from HTML as cleanly as possible.
|
227 |
|
|
|
248 |
except TypeError:
|
249 |
pass
|
250 |
|
251 |
+
return _clean_html(html)
|
252 |
|
253 |
|
254 |
+
def _clean_html(html: str) -> str:
|
255 |
'''
|
256 |
Remove HTML markup from the given string.
|
257 |
|