import requests
from bs4 import BeautifulSoup
from newspaper import (
ArticleBinaryDataException,
ArticleException,
article,
)
from src.application.config import MAX_URL_SIZE


class URLReader:
"""
A class to extract content (title, text, images) from a given URL.
Supports two extraction methods: newspaper4k and BeautifulSoup.
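
    Example (illustrative usage; the URL is a placeholder):
        reader = URLReader("https://example.com/article")
        if reader.is_extracted:
            print(reader.title)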
"""
    def __init__(self, url: str, newspaper: bool = True):
"""
Initializes the URLReader object.
Args:
url: The URL to extract content from.
newspaper: True to use newspaper4k, False to use BeautifulSoup.
"""
self.url: str = url
        self.text: str | None = None  # Extracted text content
        self.title: str | None = None  # Extracted title
        self.images: list[str] | None = None  # List of image URLs
        self.top_image: str | None = None  # URL of the top image
        self.is_extracted: bool = False  # Whether extraction succeeded
url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
self.newspaper = newspaper
        if self.newspaper:
self.extract_content_newspaper()
else:
self.extract_content_bs()
def extract_content_newspaper(self):
"""
Extracts content from a URL using the newspaper4k library.
"""
try:
            # Preflight request to confirm the URL is reachable
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses
news = article(url=self.url, fetch_images=True)
self.title = news.title
self.text = news.text
self.images = list(set(news.images)) # Remove duplicates
self.top_image = news.top_image
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return None
except (ArticleException, ArticleBinaryDataException) as e:
print(f"\t\tβββ Error downloading article: {e}")
return None
def extract_content_bs(self):
"""
Extracts content from a URL using BeautifulSoup.
"""
try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()
            response.encoding = response.apparent_encoding  # Detect encoding
            soup = BeautifulSoup(response.text, "html.parser")
self.title = soup.title.string if soup.title else None
image_urls = [img["src"] for img in soup.find_all("img")]
self.images = image_urls
self.top_image = self.images[0]
# Remove unwanted elements from the HTML
for element in soup(
["img", "figcaption", "table", "script", "style"],
):
element.extract()
paragraphs = soup.find_all("p")
self.text = " ".join([p.get_text() for p in paragraphs])
except requests.exceptions.RequestException as e:
print(f"Error fetching URL: {e}")
return None
except Exception as e:
print(f"Error parsing HTML content from {self.url}: {e}")
return None
def get_size(self):
"""
Retrieves the size of a URL's content using a HEAD request.
"""
try:
response = requests.head(
self.url,
allow_redirects=True,
timeout=5,
)
response.raise_for_status() # Raise HTTPError for bad responses
content_length = response.headers.get("Content-Length")
if content_length is not None:
return int(content_length)
else:
print("\t\tβββ Content-Length header not found")
return None
except requests.exceptions.RequestException as e:
print(f"\t\tβββ Error getting URL size: {e}")
return None


if __name__ == "__main__":
url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
reader = URLReader(url)
print(f"Title: {reader.title}")
print(f"Text: {reader.text}")