Upload 6 files
- analyze_sentiment.py +28 -0
- cluster_news.py +224 -0
- extract_news.py +244 -0
- gather_news.py +70 -0
- input_topic.py +50 -0
- summarizer.py +40 -0
analyze_sentiment.py
ADDED
@@ -0,0 +1,28 @@
# analyze_sentiment.py

# This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.


from transformers import pipeline


# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_summary(summary):
    """
    Analyze the sentiment of the given summary.
    Returns a tuple of (sentiment, score).
    """
    try:
        if not summary.strip():
            return "No input provided.", 0.0

        result = sentiment_analyzer(summary)[0]
        sentiment = result['label']
        score = result['score']

        return sentiment, score
    except Exception as e:
        return f"Error analyzing sentiment: {str(e)}", 0.0

# Example usage
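A minimal usage sketch to go with the "# Example usage" comment above (not part of the upload); the sample sentence and the printed output are illustrative.

from analyze_sentiment import analyze_summary

sentiment, score = analyze_summary("The market rallied after a strong earnings report.")
print(f"{sentiment} ({score:.2f})")  # e.g. "POSITIVE (0.99)" from the SST-2 DistilBERT model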
cluster_news.py
ADDED
@@ -0,0 +1,224 @@
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def generate_embeddings(df, content_column):
    """
    Generate embeddings for the content using SentenceTransformer.
    """
    print("🔢 Generating embeddings for clustering...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return embeddings


def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
    """
    Determine the optimum number of clusters using silhouette analysis.
    """
    print("🔍 Determining the optimum number of clusters using silhouette analysis...")
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")

    # Adjust max_clusters to ensure it does not exceed n_samples - 1
    max_clusters = min(max_clusters, n_samples - 1)

    best_num_clusters = min_clusters
    best_score = -1

    for n_clusters in range(min_clusters, max_clusters + 1):
        try:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

            if score > best_score:
                best_score = score
                best_num_clusters = n_clusters
        except ValueError as e:
            print(f"Skipping {n_clusters} clusters due to error: {e}")

    print(f"✅ Optimum number of clusters determined: {best_num_clusters}")
    return best_num_clusters


def cluster_embeddings(embeddings, num_clusters):
    """
    Perform KMeans clustering on the embeddings.
    """
    print(f"📊 Clustering articles into {num_clusters} clusters using KMeans...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embeddings)
    return kmeans.labels_, kmeans


def extract_tfidf_labels(df, content_column, cluster_labels):
    """
    Extract top TF-IDF keywords for each cluster.
    """
    print("🔠 Extracting TF-IDF-based keywords for cluster labels...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = ", ".join(top_terms)

    return tfidf_labels


def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
    """
    Apply topic modeling (LDA) within each cluster to refine and describe topics.
    """
    print("🔍 Applying topic modeling within each cluster...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    topic_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(texts)

        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda.fit(tfidf_matrix)

        # Extract top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic_idx, topic in enumerate(lda.components_):
            top_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_indices]))
        topic_labels[cluster_id] = " | ".join(topics)

    return topic_labels


def filter_similar_topics(topic_keywords_list, threshold=0.75):
    """
    Filter out similar topics based on cosine similarity of their embeddings.
    """
    print("🔄 Filtering similar topics...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
    embeddings = model.encode(topic_sentences)
    unique_indices = []
    for i, emb in enumerate(embeddings):
        if all(cosine_similarity([emb], [embeddings[j]])[0][0] < threshold for j in unique_indices):
            unique_indices.append(i)
    return [topic_keywords_list[i] for i in unique_indices]


def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
    """
    Get the most representative summary for each cluster based on proximity to the cluster centroid.
    """
    print("🔄 Refining cluster labels using representative summaries...")
    representatives = {}
    for i in range(kmeans.n_clusters):
        indices = [j for j, label in enumerate(cluster_labels) if label == i]
        if not indices:
            continue
        cluster_embeddings = embeddings[indices]
        centroid = kmeans.cluster_centers_[i]
        distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
        closest_idx = indices[np.argmin(distances)]
        representatives[i] = df.iloc[closest_idx][summary_column]

    return representatives


def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
    """
    Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and Topic Modeling.
    Display detected topics for each cluster with Primary focus and Related topics.
    """
    if df.empty:
        print("No articles to cluster.")
        return None

    # Step 1: Generate embeddings
    embeddings = generate_embeddings(df, content_column)

    # Step 2: Determine the optimum number of clusters
    num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)

    # Step 3: Perform clustering
    cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
    df['cluster_label'] = cluster_labels

    # Step 4: Extract TF-IDF matrix
    print("🔠 Extracting TF-IDF matrix for clusters...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
    feature_names = vectorizer.get_feature_names_out()

    # Step 5: Process each cluster
    print("🔍 Processing clusters for TF-IDF and topic modeling...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(idx)

    refined_labels = [""] * num_clusters  # Initialize refined_labels with empty strings
    detected_topics = {}
    for cluster_id, indices in grouped.items():
        cluster_texts = tfidf_matrix[indices]

        # Extract TF-IDF keywords
        avg_tfidf = cluster_texts.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        tfidf_keywords = [feature_names[i] for i in top_indices]

        # Generate a cluster label using the top TF-IDF keywords
        cluster_label_tfidf = ", ".join(tfidf_keywords)

        # Apply topic modeling
        lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
        lda.fit(cluster_texts)
        topics = []
        topic_weights = []
        for topic_idx, topic in enumerate(lda.components_):
            top_topic_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
            topic_weights.append(topic.sum())  # Sum of weights for ranking

        # Rank topics by importance
        ranked_topics = [x for _, x in sorted(zip(topic_weights, topics), reverse=True)]

        # Generate Primary focus and Related topics
        primary_focus = ranked_topics[0] if ranked_topics else "N/A"
        related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []

        # Store detected topics for user display
        detected_topics[cluster_label_tfidf] = {
            "primary_focus": primary_focus,
            "related_topics": related_topics,
        }

        # Assign the TF-IDF keywords as the cluster label
        refined_labels[cluster_id] = cluster_label_tfidf

    # Assign refined labels to clusters
    df['cluster_label'] = [refined_labels[label] for label in cluster_labels]

    print("✅ Clustering and labeling complete!")
    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": num_clusters,
    }
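A sketch of how cluster_and_label_articles could be called, assuming a DataFrame with "content" and "summary" columns; the toy articles below are placeholders, and real inputs would be full article texts produced earlier in the pipeline.

import pandas as pd
from cluster_news import cluster_and_label_articles

# Placeholder articles; in the app these come from the extraction/summarization steps
df = pd.DataFrame({
    "content": [
        "Central banks signal further interest rate cuts amid cooling inflation.",
        "New battery chemistry promises cheaper grid-scale energy storage.",
        "Inflation data pushes bond yields lower across major economies.",
    ],
    "summary": [
        "Rate cuts expected as inflation cools.",
        "Cheaper grid storage from new batteries.",
        "Bond yields fall on inflation data.",
    ],
})

result = cluster_and_label_articles(df, max_clusters=3)
if result:
    print("Clusters:", result["number_of_clusters"])
    for label, topics in result["detected_topics"].items():
        print(label, "->", topics["primary_focus"])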
extract_news.py
ADDED
@@ -0,0 +1,244 @@
# extract_news.py

# This script extracts news articles from various sources, including NewsAPI and Google News RSS, using the URLs saved by gather_news.py.
# It includes functions for extracting clean, full-text content from the articles and storing the metadata in a file.


# Article Scraping & Text Extraction

from newspaper import Article
import pandas as pd
import logging
import re
import requests
from bs4 import BeautifulSoup


# For each URL from NewsAPI or RSS: create Article(url), call .download(), .parse(), and read .text;
# optionally use .nlp() to get a summary and keywords.

def extract_full_content(url, min_length=300):
    """
    Extract full content and title from the given URL using newspaper3k.
    Always returns a tuple (content, title) or (None, None).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"

        # Filter out short content
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None, None


def extract_full_content_rss(url, min_length=300):
    """
    Extract full content and title from an RSS article using BeautifulSoup.
    Always returns a tuple: (text, title) or (None, None).
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Error fetching URL {url}: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs]).strip()

        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Error extracting content from {url}: {str(e)}")
        return None, None


# Handle common edge cases: paywalled content (skip or tag), duplicate links, and broken URLs.

def is_paywalled(url):
    """
    Check if the URL looks paywalled.
    """
    paywall_indicators = ['paywall', 'subscription', 'premium']
    return any(indicator in url for indicator in paywall_indicators)

def is_paywalled_content(article):
    """
    Check if the article is paywalled.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_paywalled(article.get("url", "")):
        return True
    return False

def is_duplicate(url, existing_urls):
    """
    Check if the URL is a duplicate.
    """
    return url in existing_urls

def is_broken(url):
    """
    Check if the URL is broken.
    """
    try:
        response = requests.head(url, allow_redirects=True)
        return response.status_code != 200
    except requests.RequestException:
        return True

def is_valid_url(url):
    """
    Check if the URL is syntactically valid.
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def is_valid_url_content(url):
    """
    Check if the URL is valid, reachable, and not paywalled.
    """
    if not url:
        return False
    if not is_valid_url(url):
        return False
    if is_paywalled(url):
        return False
    if is_broken(url):
        return False
    return True

# Additional functions to check whether an article has empty content or comes from a blocked site.

def is_empty_content(article):
    """
    Check if the article content is empty.
    """
    if not article:
        return True
    if not article.get("text"):
        return True
    return False

def is_blocked_site(url):
    """
    Check if the URL is from a blocked site.
    """
    blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
    return any(blocked_site in url for blocked_site in blocked_sites)

def is_blocked_content(article):
    """
    Check if the article is from a blocked site.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_blocked_site(article.get("url", "")):
        return True
    return False

# Extract news articles from the given URLs

def extract_news_articles(urls):
    """
    Extract news articles from the given URLs.
    """
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        # extract_full_content returns a (text, title) tuple; build the article dict expected downstream
        text, title = extract_full_content(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"title": title, "url": url, "text": text}

        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

def extract_news_articles_rss(urls):
    """
    Extract news articles from the given RSS URLs.
    """
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        # extract_full_content_rss also returns a (text, title) tuple
        text, title = extract_full_content_rss(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"title": title, "url": url, "text": text}

        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

# Metadata Structuring and Storage
# Helpers to build a DataFrame of the extracted metadata (title, url, source, author, published_at, full_text)
# for each extracted article and save it to a CSV or JSON file.

def create_dataframe(articles):
    """
    Create a pandas DataFrame from the list of articles.
    """
    return pd.DataFrame(articles)

def save_to_csv(df, filename):
    """
    Save the DataFrame to a CSV file.
    """
    df.to_csv(filename, index=False)

def save_to_json(df, filename):
    """
    Save the DataFrame to a JSON file.
    """
    df.to_json(filename, orient="records", lines=True)
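A short sketch of driving the extractor and saving the metadata; the URLs below are placeholders (real ones come from gather_news.py) and the output filename is arbitrary.

from extract_news import extract_news_articles, create_dataframe, save_to_csv

# Placeholder URLs; in the app these come from fetch_articles_newsapi / fetch_articles_google
urls = [
    "https://example.org/news/story-1",
    "https://example.org/news/story-2",
]

articles = extract_news_articles(urls)
df = create_dataframe(articles)
save_to_csv(df, "extracted_articles.csv")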
gather_news.py
ADDED
@@ -0,0 +1,70 @@
# gather_news.py


# News Source Integration
# This script integrates with various news sources to fetch the latest articles for a given topic
# and extracts relevant information such as title, URL, source, author, and publish date.

import config
import requests
import feedparser

def fetch_articles_newsapi(topic):
    """
    Fetch articles from NewsAPI based on the provided topic.
    """
    url = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'q': topic,
        'pageSize': 20
    }
    try:
        response = requests.get(url, params=params)
        if response.status_code != 200:
            return f"Error: Failed to fetch news. Status code: {response.status_code}"

        articles = response.json().get("articles", [])
        if not articles:
            return "No articles found."

        # Extract relevant information from each article
        extracted_articles = []
        for article in articles:
            extracted_articles.append({
                "title": article.get("title", "No title"),
                "url": article.get("url", "#"),
                "source": article.get("source", {}).get("name", "Unknown"),
                "author": article.get("author", "Unknown"),
                "publishedAt": article.get("publishedAt", "Unknown")
            })

        return extracted_articles
    except Exception as e:
        return f"Error fetching news: {str(e)}"

def fetch_articles_google(topic):
    """
    Fetch articles from the Google News RSS feed based on the provided topic.
    """
    rss_url = f'https://news.google.com/rss/search?q={topic}&hl=en-US&gl=US&ceid=US:en'
    try:
        feed = feedparser.parse(rss_url)
        if not feed.entries:
            return "No articles found."

        # Extract relevant information from each article
        extracted_articles = []
        for entry in feed.entries[:20]:  # Limit to top 20 articles
            extracted_articles.append({
                "title": entry.title,
                "url": entry.link,
                "source": entry.source.title if hasattr(entry, 'source') else "Unknown",
                "author": entry.author if hasattr(entry, 'author') else "Unknown",
                "publishedAt": entry.published if hasattr(entry, 'published') else "Unknown"
            })

        return extracted_articles
    except Exception as e:
        return f"Error fetching news: {str(e)}"
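A usage sketch assuming a config.py alongside this file that defines api_key (a NewsAPI key); the topic string is illustrative. Both fetchers return a list of article dicts on success and a plain error string otherwise, which the caller has to distinguish.

from gather_news import fetch_articles_newsapi, fetch_articles_google

topic = "renewable energy"

newsapi_articles = fetch_articles_newsapi(topic)
if isinstance(newsapi_articles, str):        # error message or "No articles found."
    print(newsapi_articles)
else:
    urls = [a["url"] for a in newsapi_articles]
    print(f"Collected {len(urls)} URLs from NewsAPI")

google_articles = fetch_articles_google(topic)  # Google News RSS needs no API key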
input_topic.py
ADDED
@@ -0,0 +1,50 @@
# input_topic.py


# Input Design
# This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.

def get_topic():
    topic = input("Enter a topic to search for news articles: ")
    if not topic:
        print("No topic provided. Please enter a valid topic.")
        return None
    if len(topic) > 100:  # Arbitrary limit for topic length
        print("Topic is too long. Please enter a shorter topic.")
        return None
    if not topic.isascii():
        print("Topic contains non-ASCII characters. Please use only ASCII characters.")
        return None
    if not topic.isprintable():
        print("Topic contains non-printable characters. Please use only printable characters.")
        return None
    if topic[0].isdigit():
        print("Topic should not start with a digit. Please enter a valid topic.")
        return None
    if topic[0] == ' ':
        print("Topic should not start with a space. Please enter a valid topic.")
        return None
    # Normalize the input to lowercase and strip any leading/trailing whitespace.
    topic = topic.lower().strip()
    # Check for special characters and replace them with spaces.
    special_chars = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '{', '}', '[', ']', '|', ':', ';', '"', "'", '<', '>', ',', '.', '?', '/', '\\']
    for char in special_chars:
        topic = topic.replace(char, ' ')
    # Remove extra spaces
    topic = ' '.join(topic.split())
    # Check if the topic is empty after normalization
    if not topic:
        print("Topic is empty after normalization. Please enter a valid topic.")
        return None
    # Check for common stop words and remove them
    stop_words = ['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'on', 'with', 'as', 'by', 'this', 'that']
    topic_words = topic.split()
    topic = ' '.join([word for word in topic_words if word not in stop_words])
    # Check if the topic is empty after removing stop words
    if not topic:
        print("Topic is empty after removing stop words. Please enter a valid topic.")
        return None

    return topic
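A brief interactive sketch; get_topic prompts on stdin and returns either a normalized topic string or None, so callers should check before continuing. The printed messages are illustrative.

from input_topic import get_topic

topic = get_topic()
if topic:
    print(f"Searching news for: {topic}")
else:
    print("No usable topic entered.")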
summarizer.py
ADDED
@@ -0,0 +1,40 @@
# summarizer.py
# This script summarizes the content of each article on the specified topic using the Hugging Face Transformers library.

from transformers import pipeline

# Load summarization pipeline once globally
summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")

# Alternative: load the tokenizer and model explicitly
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#tokenizer = AutoTokenizer.from_pretrained("flant5-base")
#model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
#summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Function to split text into smaller chunks
def split_text(text, max_tokens=512):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield ' '.join(words[i:i + max_tokens])

# Function to clean text
def clean_text(text):
    text = ' '.join(text.split())
    text = ' '.join(word for word in text.split() if len(word) < 100)
    return text

def generate_summary(content):
    try:
        if not content.strip():
            return "No input provided."
        cleaned_text = clean_text(content)
        chunks = list(split_text(cleaned_text))
        # Summarize each chunk and join the pieces so long articles stay within the model's input limit
        summary = ''.join([summarizer(chunk, do_sample=False)[0]['summary_text'] for chunk in chunks if chunk.strip()]) if chunks else ''
        return summary
    except Exception as e:
        return f"Error generating summary: {str(e)}"
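A hedged end-to-end sketch (a hypothetical run_pipeline.py, not part of this commit) showing how the uploaded modules could be wired together; it assumes the files sit in one directory and that config.api_key holds a valid NewsAPI key. Clustering via cluster_and_label_articles could then run on the resulting DataFrame.

import pandas as pd

from input_topic import get_topic
from gather_news import fetch_articles_newsapi
from extract_news import extract_news_articles
from summarizer import generate_summary
from analyze_sentiment import analyze_summary

topic = get_topic()
if topic:
    gathered = fetch_articles_newsapi(topic)
    if isinstance(gathered, str):            # a plain string signals an error message
        print(gathered)
    else:
        urls = [a["url"] for a in gathered]
        rows = []
        for article in extract_news_articles(urls):
            summary = generate_summary(article["text"])
            sentiment, score = analyze_summary(summary)
            rows.append({"title": article["title"], "summary": summary,
                         "sentiment": sentiment, "score": score})
        print(pd.DataFrame(rows).head())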