gperdrizet committed on
Commit
3937afc
·
unverified ·
1 Parent(s): f6ace8a

Switched to simple check for domain extension.

Browse files
functions/helper_functions.py CHANGED
@@ -20,6 +20,7 @@ def get_url(company_name: str) -> str:
20
  '''
21
 
22
  logger = logging.getLogger(__name__ + '.get_url')
 
23
 
24
  query = f'{company_name} official website'
25
 
@@ -45,10 +46,6 @@ def get_feed(website_url: str) -> str:
45
 
46
  feeds = feed_search(website_url)
47
 
48
- logger.info('Feeds search result is: %s', type(feeds))
49
- logger.info('Feeds search results: %s', len(feeds))
50
- logger.info('Feeds results: %s', list(feeds))
51
-
52
  if len(feeds) > 0:
53
  return str(feeds[0].url)
54
 
 
20
  '''
21
 
22
  logger = logging.getLogger(__name__ + '.get_url')
23
+ logger.info('Getting website URL for %s', company_name)
24
 
25
  query = f'{company_name} official website'
26
 
 
46
 
47
  feeds = feed_search(website_url)
48
 
 
 
 
 
49
  if len(feeds) > 0:
50
  return str(feeds[0].url)
51
 
functions/tools.py CHANGED
@@ -2,11 +2,11 @@
2
 
3
  import logging
4
  from urllib.parse import urlparse
5
- import validators
6
  import functions.helper_functions as helper_funcs
7
 
8
  FEED_URIS = {}
9
  RSS_EXTENSIONS = ['xml', 'rss', 'atom']
 
10
 
11
 
12
  def get_content(website: str) -> list:
@@ -39,7 +39,7 @@ def get_content(website: str) -> list:
39
  # If neither of those get it - try feedparse if it looks like a url
40
  # or else just google it
41
  else:
42
- if validators.url(website):
43
  website_url = website
44
  logger.info('%s looks like a website URL', website)
45
 
 
2
 
3
  import logging
4
  from urllib.parse import urlparse
 
5
  import functions.helper_functions as helper_funcs
6
 
7
  FEED_URIS = {}
8
  RSS_EXTENSIONS = ['xml', 'rss', 'atom']
9
+ COMMON_EXTENSIONS = ['com', 'net', 'org', 'edu', 'gov', 'co', 'us']
10
 
11
 
12
  def get_content(website: str) -> list:
 
39
  # If neither of those get it - try feedparse if it looks like a url
40
  # or else just google it
41
  else:
42
+ if website.split('.')[-1] in COMMON_EXTENSIONS:
43
  website_url = website
44
  logger.info('%s looks like a website URL', website)
45