QuickPulse / extract_news.py
harao-ml's picture
Upload 8 files
97420da verified
raw
history blame
1.19 kB
# extract_news.py
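# This module extracts the full text and title of news articles using the
# newspaper3k library, and provides helpers to collect the results into a
# pandas DataFrame and save them as CSV.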
import logging
import pandas as pd
from newspaper import Article
def extract_full_content(url, min_length=100):
    """Download one article and return its URL, body text and title.

    The page is fetched and parsed with newspaper's ``Article``.

    Args:
        url: Address of the article to download.
        min_length: Minimum number of characters the stripped body text
            must have; shorter extractions are rejected.

    Returns:
        A dict with the keys ``"url"``, ``"text"`` and ``"title"``, or
        ``None`` when the text is shorter than ``min_length`` or when any
        step of the extraction raises. Failures are logged, not raised.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        # Strip *before* applying the fallback so that a missing, empty or
        # whitespace-only title all end up as "Untitled" instead of "".
        title = (article.title or "").strip() or "Untitled"
        if len(text) < min_length:
            logging.warning("Extracted content is too short from %s.", url)
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        # Best-effort extraction: the error is logged and reported to the
        # caller as None rather than propagated.
        logging.error("Failed to extract content from %s: %s", url, e)
        return None
def extract_news_articles(urls, min_length=100):
    """Run ``extract_full_content`` over ``urls`` and keep the successes.

    Results that are falsy or that carry no ``"text"`` value are skipped.
    Every kept result gets an additional ``"original_url"`` key holding the
    URL that was requested.

    Args:
        urls: Iterable of article URLs.
        min_length: Forwarded to ``extract_full_content``.

    Returns:
        A list of the kept article dicts, in the order of ``urls``.
    """
    collected = []
    for source_url in urls:
        result = extract_full_content(source_url, min_length=min_length)
        if not result or not result.get("text"):
            continue
        result["original_url"] = source_url
        collected.append(result)
    return collected
def create_dataframe(articles):
    """Build a pandas DataFrame from ``articles``.

    Args:
        articles: Data passed unchanged to the ``pd.DataFrame`` constructor
            (presumably the list of dicts produced by
            ``extract_news_articles``).

    Returns:
        The newly constructed DataFrame.
    """
    frame = pd.DataFrame(data=articles)
    return frame
def save_to_csv(df, filename):
    """Write ``df`` to ``filename`` as CSV, omitting the index column.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame).
        filename: Destination handed to ``df.to_csv`` as its first argument.
    """
    # index=False keeps the row index out of the written file.
    csv_options = {"index": False}
    df.to_csv(filename, **csv_options)