Spaces:
Running
Running
# extract_news.py
# This script extracts full content from news articles using the newspaper3k library.
import logging | |
import pandas as pd | |
from newspaper import Article | |
def extract_full_content(url, min_length=100):
    """Download and parse one article and return its cleaned content.

    Args:
        url: Address of the article to fetch.
        min_length: Minimum number of characters the stripped body text
            must contain for the article to be accepted.

    Returns:
        A dict with the keys ``"url"``, ``"text"`` and ``"title"``, or
        ``None`` when the stripped text is shorter than ``min_length`` or
        when any step of the download/parse raises an exception.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        # Treat a missing body as empty so the length check below decides
        # the outcome rather than an AttributeError on ``None.strip()``.
        text = (article.text or "").strip()
        # Strip first, then fall back: a whitespace-only title would
        # otherwise slip through as an empty string instead of "Untitled".
        title = (article.title or "").strip() or "Untitled"
        if len(text) < min_length:
            logging.warning("Extracted content is too short from %s.", url)
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        # Deliberately broad: extraction is best-effort, so one bad URL is
        # logged and skipped instead of aborting the caller.
        logging.error("Failed to extract content from %s: %s", url, e)
        return None
def extract_news_articles(urls, min_length=100):
    """Extract every URL in ``urls`` and collect the ones that succeed.

    Args:
        urls: Iterable of article addresses to process, in order.
        min_length: Forwarded to ``extract_full_content`` as the minimum
            accepted text length.

    Returns:
        A list of the dicts returned by ``extract_full_content``, each with
        an added ``"original_url"`` key holding the URL it came from.
        URLs that yield no result or an empty ``"text"`` are left out.
    """
    collected = []
    for link in urls:
        record = extract_full_content(link, min_length=min_length)
        # Skip failed extractions and records without usable text.
        if not record or not record.get("text"):
            continue
        record["original_url"] = link
        collected.append(record)
    return collected
def create_dataframe(articles):
    """Build a pandas DataFrame from the given article records.

    Args:
        articles: Data passed straight to the ``pd.DataFrame`` constructor.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    frame = pd.DataFrame(data=articles)
    return frame
def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV without the index column.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this module).
        filename: Destination handed to ``df.to_csv`` as its first argument.
    """
    include_index = False
    df.to_csv(filename, index=include_index)