# extract_news.py

# This script extracts news articles from the URLs saved by gather_news.py, covering sources such as NewsAPI and Google News RSS.
# It provides functions for extracting clean, full-text content from each article and for storing the article metadata in a file.


# Article Scraping & Text Extraction    

from newspaper import Article
import pandas as pd
import logging
import re
import requests
from bs4 import BeautifulSoup


# For each URL from NewsAPI or RSS:
#   * Create Article(url)
#   * Call .download(), .parse(), and read .text
#   * Optionally call .nlp() to get a summary and keywords (see the helper sketch below)

def extract_full_content(url, min_length=300):
    """
    Extract full content and title from the given URL using newspaper3k.
    Always returns a tuple (content, title) or (None, None).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"

        # Filter out short content
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None, None

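
# Optional: a minimal sketch of the .nlp() step mentioned in the comments above.
# newspaper3k's Article.nlp() populates .summary and .keywords, but it requires
# the NLTK "punkt" tokenizer data to be installed; treat this helper as
# illustrative rather than part of the main extraction pipeline.
def extract_summary_keywords(url):
    """
    Return (summary, keywords) for the article at the given URL, or (None, None) on failure.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # needs NLTK "punkt" data; raises if it is missing
        return article.summary, article.keywords
    except Exception as e:
        logging.error(f"Failed to run NLP on {url}: {str(e)}")
        return None, None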
    
def extract_full_content_rss(url, min_length=300):
    """
    Extract full content and title from an RSS article using BeautifulSoup.
    Always returns a tuple: (text, title) or (None, None).
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Error fetching URL {url}: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs]).strip()

        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Error extracting content from {url}: {str(e)}")
        return None, None


# Handle common edge cases:
#   * Paywalled content (skip or tag)
#   * Duplicate links or broken URLs
def is_paywalled(url):
    """
    Check whether the URL contains common paywall indicators.
    """
    paywall_indicators = ['paywall', 'subscription', 'premium']
    return any(indicator in url for indicator in paywall_indicators)

def is_paywalled_content(article):
    """
    Check whether an extracted article record points to a paywalled URL.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_paywalled(article.get("url", "")):
        return True
    return False

def is_duplicate(url, existing_urls):
    """
    Check whether the URL has already been processed.
    """
    return url in existing_urls

def is_broken(url):
    """
    Check whether the URL is unreachable (non-200 response or request error).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        return response.status_code != 200
    except requests.RequestException:
        return True
    
def is_valid_url(url):
    """
    Check whether the URL has a valid http/https/ftp format.
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def is_valid_url_content(url):
    """
    Check whether the URL is valid, not paywalled, and reachable.
    """
    if not url:
        return False
    if not is_valid_url(url):
        return False
    if is_paywalled(url):
        return False
    if is_broken(url):
        return False
    return True

# Additional checks for articles with empty content or from blocked sites

def is_empty_content(article):
    """
    Check whether the article record has no text content.
    """
    if not article:
        return True
    if not article.get("text"):
        return True
    return False

def is_blocked_site(url):
    """
    Check whether the URL belongs to a blocked site.
    """
    blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
    return any(blocked_site in url for blocked_site in blocked_sites)

def is_blocked_content(article):
    """
    Check whether the article record comes from a blocked site.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_blocked_site(article.get("url", "")):
        return True
    return False

#  Extract news articles from the given URLs

def extract_news_articles(urls):
    """
    Extract articles from the given NewsAPI URLs and return a list of article dicts.
    """
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        text, title = extract_full_content(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue

        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

def extract_news_articles_rss(urls):
    """
    Extract articles from the given RSS URLs and return a list of article dicts.
    """ 
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        text, title = extract_full_content_rss(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue

        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

# Metadata Structuring and Storage
# Functions to build a DataFrame with the metadata for each extracted article
# (e.g. title, url, source, author, published_at and the full text) and save it to a CSV or JSON file.

def create_dataframe(articles):
    """
    Create a pandas DataFrame from the list of articles.
    """
    return pd.DataFrame(articles)

def save_to_csv(df, filename):
    """
    Save the DataFrame to a CSV file.
    """
    df.to_csv(filename, index=False)

def save_to_json(df, filename):
    """
    Save the DataFrame to a JSON file.
    """
    df.to_json(filename, orient="records", lines=True)
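

# A minimal end-to-end sketch of how these pieces fit together. The input file
# name ("gathered_urls.csv") and its "url" column are assumptions about the
# output of gather_news.py, not something defined in this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Load the URLs saved by gather_news.py (assumed: a CSV with a "url" column).
    urls_df = pd.read_csv("gathered_urls.csv")
    urls = urls_df["url"].dropna().tolist()

    # Extract full text for each article and store the results.
    articles = extract_news_articles(urls)
    df = create_dataframe(articles)
    save_to_csv(df, "extracted_articles.csv")
    save_to_json(df, "extracted_articles.json")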