Merge pull request #8 from gperdrizet/dev
- assets/html.py +17 -2
- functions/helper_functions.py +11 -0
assets/html.py
CHANGED
@@ -3,9 +3,24 @@
 TITLE = (
     '''
     <center>
-    <h1>RSS feed
+    <h1>RSS feed finder/extractor</h1>
     </center>
     '''
 )

-DESCRIPTION =
+DESCRIPTION = (
+    '''
+    Functions to find and extract RSS feeds are complete-ish. No AI
+    yet, plan for tomorrow is to build two tools:
+
+    <ol>
+        <li>Human readable summaries of requested RSS feed</li>
+        <li>Simple RAG on requested RSS feed content</li>
+    </ol>
+
+    For now we just dump the extracted RSS content below. Try asking
+    for a feed by website name, website URL, or entering your favorite
+    feed URI directly. Suggestions: http://openai.com/news/rss.xml,
+    hackernews.com, Hugging Face, etc
+    '''
+)
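The new DESCRIPTION reads like interface copy for a small demo app. As a rough sketch only, here is one way these strings might be rendered with Gradio, assuming the assets/html.py module path shown above; the Blocks layout and component choices are assumptions, not part of this PR:

    import gradio as gr

    from assets.html import TITLE, DESCRIPTION

    # Hypothetical layout: show the HTML strings at the top of the page
    with gr.Blocks() as demo:
        gr.HTML(TITLE)          # centered <h1>RSS feed finder/extractor</h1>
        gr.HTML(DESCRIPTION)    # intro copy with the <ol> tool roadmap

    demo.launch()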
functions/helper_functions.py
CHANGED
@@ -209,6 +209,9 @@ def get_html(url: str) -> str:

             content = content.decode(encoding)

+        else:
+            content = None
+
     except HTTPError:
         content = None

@@ -227,6 +230,9 @@ def get_text(html: str) -> str:

     Returns:
         Cleaned text string'''
+
+    if html is None:
+        return None

     extractor = extractors.ArticleExtractor()

@@ -236,6 +242,11 @@ def get_text(html: str) -> str:
     except HTMLExtractionError:
         pass

+    except AttributeError:
+        pass
+
+    except TypeError:
+        pass

     return clean_html(html)

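Taken together, these helper changes make get_html and get_text fail soft: a bad fetch, decode, or extraction now propagates as None instead of raising, so callers can branch on the result. A minimal usage sketch under that assumption (the example URL is hypothetical):

    from functions.helper_functions import get_html, get_text

    url = 'https://hf.co/blog'   # hypothetical example URL
    html = get_html(url)         # None if the request or decode path fails
    text = get_text(html)        # short-circuits to None when html is None

    if text is None:
        print('Could not fetch or extract readable text')
    else:
        print(text[:500])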