File size: 4,225 Bytes
22e1b62
38fd181
22e1b62
38fd181
 
 
 
 
 
22e1b62
b73a4fc
38fd181
 
 
b73a4fc
 
 
 
 
38fd181
b73a4fc
 
 
 
 
 
 
 
 
 
 
 
 
38fd181
1ce1659
38fd181
1ce1659
38fd181
1ce1659
38fd181
b73a4fc
22e1b62
 
 
 
38fd181
22e1b62
 
b73a4fc
22e1b62
 
 
b73a4fc
 
 
 
 
 
 
 
 
22e1b62
 
 
 
 
 
38fd181
22e1b62
 
b73a4fc
22e1b62
b73a4fc
 
 
38fd181
b73a4fc
38fd181
22e1b62
38fd181
00b1038
38fd181
b73a4fc
 
 
38fd181
b73a4fc
 
 
 
 
22e1b62
b73a4fc
 
 
 
 
 
 
 
 
38fd181
1ce1659
 
 
 
 
38fd181
 
 
 
b73a4fc
38fd181
1ce1659
38fd181
1ce1659
 
 
38fd181
1ce1659
 
 
 
b73a4fc
a6b0abd
38fd181
 
a6b0abd
 
 
38fd181
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import string
from typing import Optional

import requests
from bs4 import BeautifulSoup
from newspaper import (
    ArticleBinaryDataException,
    ArticleException,
    article,
)

from src.application.config import MAX_URL_SIZE


class URLReader:
    """
    Extract content (title, text, images) from a given URL.

    Supports two extraction methods: newspaper4k and BeautifulSoup.
    Extraction is skipped entirely when the resource's reported size is
    unknown or exceeds MAX_URL_SIZE.
    """

    def __init__(self, url: str, newspaper: bool = True):
        """
        Initializes the URLReader object and runs extraction immediately.

        Args:
            url: The URL to extract content from.
            newspaper: True to use newspaper4k, False to use BeautifulSoup.
        """
        self.url: str = url
        self.text: Optional[str] = None  # Extracted text content
        self.title: Optional[str] = None  # Extracted title
        self.images: Optional[list[str]] = None  # List of image URLs
        self.top_image: Optional[str] = None  # URL of the top image
        # NOTE(review): True means the size check passed and extraction
        # was *attempted*; the extractors below may still fail and leave
        # the content attributes as None.
        self.is_extracted: bool = False

        # Skip extraction when the size is unknown or exceeds the limit.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True

        self.newspaper = newspaper
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()

    def extract_content_newspaper(self):
        """
        Extracts content from a URL using the newspaper4k library.

        Populates title, text, images (deduplicated) and top_image.
        On failure, prints the error and leaves the attributes unchanged.
        """
        try:
            # Probe the URL first so obvious HTTP errors are surfaced
            # before newspaper4k performs its own download.
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses

            news = article(url=self.url, fetch_images=True)

            self.title = news.title
            self.text = news.text
            self.images = list(set(news.images))  # Remove duplicates
            self.top_image = news.top_image

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t↑↑↑ Error downloading article: {e}")
            return None

    def extract_content_bs(self):
        """
        Extracts content from a URL using BeautifulSoup.

        Populates title, images, top_image, and text (the joined text of
        all <p> tags after stripping images, captions, tables, scripts
        and styles). On failure, prints the error and leaves the
        attributes unchanged.
        """
        try:
            response = requests.get(self.url, timeout=10)
            response.raise_for_status()

            response.encoding = response.apparent_encoding  # Detect encoding

            soup = BeautifulSoup(response.content, "html.parser")

            self.title = soup.title.string if soup.title else None

            # Skip <img> tags without a src attribute instead of raising
            # KeyError on them.
            image_urls = [
                img["src"] for img in soup.find_all("img") if img.get("src")
            ]
            self.images = image_urls
            # Pages may legitimately have no images; indexing blindly
            # raised IndexError here.
            self.top_image = self.images[0] if self.images else None

            # Remove unwanted elements from the HTML
            for element in soup(
                ["img", "figcaption", "table", "script", "style"],
            ):
                element.extract()

            paragraphs = soup.find_all("p")
            self.text = " ".join([p.get_text() for p in paragraphs])

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except Exception as e:
            print(f"Error parsing HTML content from {self.url}: {e}")
            return None

    def get_size(self):
        """
        Retrieves the size of a URL's content using a HEAD request.

        Returns:
            The integer Content-Length in bytes, or None when the header
            is missing or the request fails.
        """
        try:
            response = requests.head(
                self.url,
                allow_redirects=True,
                timeout=5,
            )
            response.raise_for_status()  # Raise HTTPError for bad responses

            content_length = response.headers.get("Content-Length")
            if content_length is not None:
                return int(content_length)
            print("\t\t↑↑↑ Content-Length header not found")
            return None

        except requests.exceptions.RequestException as e:
            print(f"\t\t↑↑↑ Error getting URL size: {e}")
            return None


if __name__ == "__main__":
    # Quick manual smoke test against a live article.
    sample_url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    content = URLReader(sample_url)
    for label, value in (("Title", content.title), ("Text", content.text)):
        print(f"{label}: {value}")