# extract_news.py
# This script extracts news articles from various sources (NewsAPI and Google News RSS) using the URLs saved by gather_news.py.
# It includes functions for extracting clean, full-text content from the articles and storing the metadata in a file.
# Article Scraping & Text Extraction
from newspaper import Article
import pandas as pd
import logging
import re
import requests
from bs4 import BeautifulSoup

# For each URL from NewsAPI or RSS:
#   * Create Article(url)
#   * Call .download(), .parse(), and read .text
#   * Optionally use .nlp() to get a summary and keywords (see the sketch after extract_full_content)
def extract_full_content(url, min_length=300):
    """
    Extract full content and title from the given URL using newspaper3k.
    Always returns a tuple (content, title) or (None, None).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"
        # Filter out content that is too short to be a real article
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None
        return text, title
    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None, None
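
# A minimal sketch of the optional .nlp() step mentioned in the comment above.
# Illustrative only and not called by the pipeline; newspaper3k's Article.nlp()
# typically requires NLTK's "punkt" tokenizer data to be available.
def extract_summary_and_keywords(url):
    """
    Return (summary, keywords) for the article at `url`, or (None, []) on failure.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # populates article.summary and article.keywords
        return article.summary, article.keywords
    except Exception as e:
        logging.error(f"Failed to run NLP on {url}: {str(e)}")
        return None, []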
def extract_full_content_rss(url, min_length=300):
    """
    Extract full content and title from an RSS article using BeautifulSoup.
    Always returns a tuple: (text, title) or (None, None).
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Error fetching URL {url}: {response.status_code}")
            return None, None
        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs]).strip()
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None
        return text, title
    except Exception as e:
        logging.error(f"Error extracting content from {url}: {str(e)}")
        return None, None
# Handle common edge cases:
#   * Paywalled content (skip or tag)
#   * Duplicate links or broken URLs
def is_paywalled(url):
    """
    Check if the URL looks paywalled based on common URL keywords.
    """
    paywall_indicators = ['paywall', 'subscription', 'premium']
    return any(indicator in url for indicator in paywall_indicators)

def is_paywalled_content(article):
    """
    Check if the extracted article is paywalled.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_paywalled(article.get("url", "")):
        return True
    return False

def is_duplicate(url, existing_urls):
    """
    Check if the URL is a duplicate.
    """
    return url in existing_urls

def is_broken(url):
    """
    Check if the URL is broken (unreachable or non-200).
    """
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        return response.status_code != 200
    except requests.RequestException:
        return True
def is_valid_url(url):
    """
    Check if the URL is syntactically valid.
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def is_valid_url_content(url):
    """
    Check if the URL is valid, not paywalled, and reachable.
    """
    if not url:
        return False
    if not is_valid_url(url):
        return False
    if is_paywalled(url):
        return False
    if is_broken(url):
        return False
    return True
# Additional functions to check whether the article has empty content or comes from a blocked site
def is_empty_content(article):
    """
    Check if the article content is empty.
    """
    if not article:
        return True
    if not article.get("text"):
        return True
    return False

def is_blocked_site(url):
    """
    Check if the URL is from a blocked site.
    """
    blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
    return any(blocked_site in url for blocked_site in blocked_sites)

def is_blocked_content(article):
    """
    Check if the article is from a blocked site.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_blocked_site(article.get("url", "")):
        return True
    return False
# Extract news articles from the given URLs
def extract_news_articles(urls):
    """
    Extract news articles from the given URLs.
    """
    extracted_articles = []
    existing_urls = set()
    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)
        # extract_full_content returns a (text, title) tuple, so unpack it
        # and build the article dict expected by the downstream checks
        text, title = extract_full_content(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue
        extracted_articles.append(article)
    return extracted_articles
def extract_news_articles_rss(urls):
    """
    Extract news articles from the given RSS URLs.
    """
    extracted_articles = []
    existing_urls = set()
    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)
        # extract_full_content_rss also returns a (text, title) tuple
        text, title = extract_full_content_rss(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"url": url, "title": title, "text": text}
        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue
        extracted_articles.append(article)
    return extracted_articles
# Metadata Structuring and Storage
# Functions to build a DataFrame of metadata (title, url, source, author, published_at, full_text)
# for each extracted article and save it to a CSV or JSON file (see the illustrative record shape after create_dataframe).
def create_dataframe(articles):
    """
    Create a pandas DataFrame from the list of articles.
    """
    return pd.DataFrame(articles)
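
# Illustrative record shape for create_dataframe. The values are placeholders,
# and the source/author/published_at fields are assumed to come from the
# gather_news.py metadata rather than being produced in this file:
# {
#     "title": "Example headline",
#     "url": "https://example.com/article",
#     "source": "Example News",
#     "author": "Jane Doe",
#     "published_at": "2024-01-01T00:00:00Z",
#     "full_text": "Full article text ...",
# }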
def save_to_csv(df, filename):
    """
    Save the DataFrame to a CSV file.
    """
    df.to_csv(filename, index=False)

def save_to_json(df, filename):
    """
    Save the DataFrame to a JSON file.
    """
    df.to_json(filename, orient="records", lines=True)
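
# A minimal usage sketch tying the pipeline together. Assumptions: the URL list
# would normally come from the gather_news.py output, and the sample URL and
# output file names below are placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sample_urls = [
        "https://example.com/some-article",  # placeholder URL
    ]
    articles = extract_news_articles(sample_urls)
    if articles:
        df = create_dataframe(articles)
        save_to_csv(df, "extracted_articles.csv")    # placeholder filename
        save_to_json(df, "extracted_articles.json")  # placeholder filename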