# extract_news.py
# This script extracts news articles from various sources (NewsAPI and Google News RSS) using the URLs saved by gather_news.py.
# It includes functions for extracting clean, full-text content from the articles and storing the metadata in a file.
# Article Scraping & Text Extraction
from newspaper import Article
import pandas as pd
import logging
import re
import requests
from bs4 import BeautifulSoup

# For each URL from NewsAPI or RSS:
#   * Create Article(url)
#   * Call .download(), .parse(), and read .text
#   * Optionally use .nlp() to get a summary and keywords (see the sketch after extract_full_content)
def extract_full_content(url, min_length=300):
    """
    Extract full content and title from the given URL using newspaper3k.
    Always returns a tuple (content, title) or (None, None).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"
        # Filter out content that is too short to be a real article
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None
        return text, title
    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None, None
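
# A minimal sketch of the optional .nlp() step mentioned in the comment above.
# Illustrative only and not called by the pipeline; newspaper3k's Article.nlp()
# typically requires NLTK's "punkt" tokenizer data to be available.
def extract_summary_and_keywords(url):
    """
    Return (summary, keywords) for the article at `url`, or (None, []) on failure.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # populates article.summary and article.keywords
        return article.summary, article.keywords
    except Exception as e:
        logging.error(f"Failed to run NLP on {url}: {str(e)}")
        return None, []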
def extract_full_content_rss(url, min_length=300):
    """
    Extract full content and title from an RSS article using BeautifulSoup.
    Always returns a tuple: (text, title) or (None, None).
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Error fetching URL {url}: {response.status_code}")
            return None, None
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs]).strip()
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None
        return text, title
    except Exception as e:
        logging.error(f"Error extracting content from {url}: {str(e)}")
        return None, None
# Handle common edge cases:
#   * Paywalled content (skip or tag)
#   * Duplicate links or broken URLs
def is_paywalled(url):
    """
    Check if the URL looks paywalled based on common URL keywords.
    """
    paywall_indicators = ['paywall', 'subscription', 'premium']
    return any(indicator in url for indicator in paywall_indicators)

def is_paywalled_content(article):
    """
    Check if the extracted article is paywalled.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_paywalled(article.get("url", "")):
        return True
    return False

def is_duplicate(url, existing_urls):
    """
    Check if the URL is a duplicate.
    """
    return url in existing_urls

def is_broken(url):
    """
    Check if the URL is broken (unreachable or non-200).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        return response.status_code != 200
    except requests.RequestException:
        return True
def is_valid_url(url):
    """
    Check if the URL is syntactically valid.
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def is_valid_url_content(url):
    """
    Check if the URL is valid, not paywalled, and reachable.
    """
    if not url:
        return False
    if not is_valid_url(url):
        return False
    if is_paywalled(url):
        return False
    if is_broken(url):
        return False
    return True
# Additional functions to check whether the article has empty content or comes from a blocked site
def is_empty_content(article):
    """
    Check if the article content is empty.
    """
    if not article:
        return True
    if not article.get("text"):
        return True
    return False

def is_blocked_site(url):
    """
    Check if the URL is from a blocked site.
    """
    blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
    return any(blocked_site in url for blocked_site in blocked_sites)

def is_blocked_content(article):
    """
    Check if the article is from a blocked site.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_blocked_site(article.get("url", "")):
        return True
    return False
# Extract news articles from the given URLs
def extract_news_articles(urls):
    """
    Extract news articles from the given URLs.
    """
    extracted_articles = []
    existing_urls = set()
    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)
        # extract_full_content returns a (text, title) tuple, so unpack it
        # and build the article dict expected by the downstream checks
        text, title = extract_full_content(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue
        extracted_articles.append(article)
    return extracted_articles
def extract_news_articles_rss(urls):
    """
    Extract news articles from the given RSS URLs.
    """
    extracted_articles = []
    existing_urls = set()
    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)
        # extract_full_content_rss also returns a (text, title) tuple
        text, title = extract_full_content_rss(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue
        extracted_articles.append(article)
    return extracted_articles
# Metadata Structuring and Storage
# Functions to build a DataFrame of metadata (title, url, source, author, published_at, full_text)
# for each extracted article and save it to a CSV or JSON file (see the illustrative record shape after create_dataframe).
def create_dataframe(articles):
    """
    Create a pandas DataFrame from the list of articles.
    """
    return pd.DataFrame(articles)
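
# Illustrative record shape for create_dataframe. The values are placeholders,
# and the source/author/published_at fields are assumed to come from the
# gather_news.py metadata rather than being produced in this file:
# {
#     "title": "Example headline",
#     "url": "https://example.com/article",
#     "source": "Example News",
#     "author": "Jane Doe",
#     "published_at": "2024-01-01T00:00:00Z",
#     "full_text": "Full article text ...",
# }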
def save_to_csv(df, filename):
    """
    Save the DataFrame to a CSV file.
    """
    df.to_csv(filename, index=False)

def save_to_json(df, filename):
    """
    Save the DataFrame to a JSON file.
    """
    df.to_json(filename, orient="records", lines=True)
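
# A minimal usage sketch tying the pipeline together. Assumptions: the URL list
# would normally come from the gather_news.py output, and the sample URL and
# output file names below are placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_urls = [
        "https://example.com/some-article",  # placeholder URL
    ]
    articles = extract_news_articles(sample_urls)
    if articles:
        df = create_dataframe(articles)
        save_to_csv(df, "extracted_articles.csv")    # placeholder filename
        save_to_json(df, "extracted_articles.json")  # placeholder filename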