File size: 1,192 Bytes
7c3be27
97420da
7c3be27
 
97420da
 
7c3be27
97420da
7c3be27
 
 
 
 
 
 
 
97420da
 
7c3be27
 
97420da
7c3be27
97420da
7c3be27
 
97420da
 
 
 
7c3be27
 
 
 
 
 
97420da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# extract_news.py
# This script extracts full content from news articles using the newspaper3k library.

import logging
import pandas as pd
from newspaper import Article

def extract_full_content(url, min_length=100):
    """Download and parse one article, returning its URL, body text and title.

    Args:
        url: Address of the article to fetch.
        min_length: Minimum number of characters the stripped body text must
            contain; shorter extractions are rejected.

    Returns:
        A dict with the keys ``"url"``, ``"text"`` and ``"title"``, or
        ``None`` when any step raises or when the stripped text is shorter
        than ``min_length``. The title is ``"Untitled"`` when the parsed
        title is missing, empty or whitespace-only.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        # Strip first, then fall back: a whitespace-only title must become
        # "Untitled" rather than an empty string.
        title = (article.title or "").strip() or "Untitled"
        if len(text) < min_length:
            logging.warning("Extracted content is too short from %s.", url)
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        # Deliberate best-effort boundary: one bad URL is logged and reported
        # as None so that callers processing many URLs can carry on.
        logging.error("Failed to extract content from %s: %s", url, e)
        return None

def extract_news_articles(urls, min_length=100):
    """Extract every URL in ``urls`` and collect the successful results.

    Args:
        urls: Iterable of article addresses to process, in order.
        min_length: Forwarded to ``extract_full_content`` as its
            ``min_length`` keyword argument.

    Returns:
        A list of the dicts returned by ``extract_full_content``, each with
        an added ``"original_url"`` key holding the URL it was requested
        with. URLs whose extraction returned a falsy result, or a result
        with a falsy ``"text"`` value, are left out.
    """
    collected = []
    for source_url in urls:
        content = extract_full_content(source_url, min_length=min_length)
        # Skip failed extractions and results without any body text.
        if not content or not content.get("text"):
            continue
        content["original_url"] = source_url
        collected.append(content)
    return collected

def create_dataframe(articles):
    """Build a pandas DataFrame from ``articles``.

    Args:
        articles: Data passed unchanged to the ``pd.DataFrame`` constructor.

    Returns:
        The newly constructed ``pd.DataFrame``.
    """
    frame = pd.DataFrame(articles)
    return frame

def save_to_csv(df, filename):
    """Write ``df`` as CSV to ``filename`` without the index column.

    Args:
        df: Object whose ``to_csv`` method performs the write.
        filename: Destination passed as the first argument to ``df.to_csv``.
    """
    include_index = False
    df.to_csv(filename, index=include_index)