Upload 8 files
- analyze_sentiment.py +9 -13
- app.py +208 -174
- cluster_news.py +137 -192
- extract_news.py +13 -221
- gather_news.py +110 -62
- input_topic.py +0 -3
- requirements.txt +3 -0
- summarizer.py +0 -8
analyze_sentiment.py
CHANGED
@@ -1,28 +1,24 @@

The DistilBERT sentiment pipeline (sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")) and a trailing "# Example usage" comment are removed; sentiment is now scored with zero-shot classification. New version:

# analyze_sentiment.py
# This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.

from transformers import pipeline

# Load zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def analyze_summary(summary):
    """
    Analyze the sentiment of the given summary using zero-shot classification.
    Returns a tuple of (sentiment, score).
    """
    try:
        if not summary.strip():
            return "No input provided.", 0.0

        candidate_labels = ["positive", "neutral", "negative"]
        result = classifier(summary, candidate_labels)
        sentiment = result['labels'][0].capitalize()
        score = float(result['scores'][0])
        return sentiment, score
    except Exception as e:
        return f"Error analyzing sentiment: {str(e)}", 0.0
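A minimal usage sketch (not part of the commit) of the updated analyze_summary; the example text is invented and the printed values are illustrative, since the actual score depends on the bart-large-mnli model:

# illustrative usage of analyze_sentiment.analyze_summary
from analyze_sentiment import analyze_summary

sentiment, score = analyze_summary(
    "The city opened a new park and residents welcomed the extra green space."
)
print(sentiment, score)  # e.g. "Positive" 0.9 — exact value depends on the model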
app.py
CHANGED

The .txt file-upload path (the uploaded_files accordion and extract_summarize_and_analyze_content_from_file) and the old display_clusters_as_columns rendering are removed; the new version adds article deduplication, Plotly analytics, and a collapsible Clustered News Digest section. Added and context lines, by hunk (unchanged lines between hunks are not shown in the diff):

@@ -1,110 +1,123 @@

## This script provides a Gradio interface for gathering, clustering, summarizing, and analyzing news articles with sentiment analysis and topic modeling.

import gather_news
import pandas as pd
import cluster_news
import summarizer
import analyze_sentiment
import extract_news
import gradio as gr
import plotly.express as px

def plot_topic_frequency(result):
    df = result["dataframe"]
    topic_counts = df["cluster_label"].value_counts().reset_index()
    topic_counts.columns = ["Topic", "Count"]
    fig = px.bar(topic_counts, x="Topic", y="Count", title="Topic Frequency", color="Topic")
    fig.update_layout(showlegend=False, height=350)
    return fig

def plot_sentiment_trends(result):
    df = result["dataframe"]
    sentiment_counts = df["sentiment"].value_counts().reset_index()
    sentiment_counts.columns = ["Sentiment", "Count"]
    fig = px.pie(sentiment_counts, names="Sentiment", values="Count", title="Sentiment Distribution")
    fig.update_traces(textinfo='label+percent')
    fig.update_layout(height=350)
    return fig

def render_top_clusters_table(result, top_n=5):
    df = result["dataframe"]
    cluster_counts = df["cluster_label"].value_counts().reset_index()
    cluster_counts.columns = ["Cluster", "Articles"]
    top_clusters = cluster_counts.head(top_n)
    return top_clusters

def fetch_and_process_latest_news(sentiment_filters):
    articles = gather_news.fetch_newsapi_top_headlines()
    return process_and_display_articles(articles, sentiment_filters, "Top Headlines")

def fetch_and_process_topic_news(topic, sentiment_filters):
    articles = gather_news.fetch_newsapi_everything(topic)
    return process_and_display_articles(articles, sentiment_filters, topic or "Topic")

def process_and_display_articles(articles, sentiment_filters, topic_label):
    if not articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    articles = sorted(articles, key=lambda x: x.get("publishedAt", ""), reverse=True)
    extracted_articles = extract_summarize_and_analyze_articles(articles)
    deduped_articles = deduplicate_articles(extracted_articles)
    if not deduped_articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    df = pd.DataFrame(deduped_articles)
    result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
    cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
    csv_file, _ = save_clustered_articles(result["dataframe"], topic_label)

    # Analytics
    topic_fig = plot_topic_frequency(result)
    sentiment_fig = plot_sentiment_trends(result)
    top_clusters_table = render_top_clusters_table(result)

    return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)

def extract_summarize_and_analyze_articles(articles):
    extracted_articles = []
    for article in articles:
        content = article.get("text") or article.get("content")
        if not content:
            continue
        title = article.get("title", "No title")
        summary = summarizer.generate_summary(content)
        sentiment, score = analyze_sentiment.analyze_summary(summary)
        extracted_articles.append({
            "title": title,
            "url": article.get("url"),
            "source": article.get("source", "Unknown"),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
            "content": content,
            "summary": summary,
            "sentiment": sentiment,
            "score": score
        })
    return extracted_articles

def deduplicate_articles(articles):
    seen_urls = set()
    seen_title_source = set()
    seen_title_summary = set()
    deduped = []
    for art in articles:
        url = art.get("url")
        title = art.get("title", "").strip().lower()
        source = art.get("source", "").strip().lower()
        summary = art.get("summary", "").strip().lower()
        key_title_source = (title, source)
        key_title_summary = (title, summary)
        if url and url in seen_urls:
            continue
        if key_title_source in seen_title_source:
            continue
        if key_title_summary in seen_title_summary:
            continue
        deduped.append(art)
        if url:
            seen_urls.add(url)
        seen_title_source.add(key_title_source)
        seen_title_summary.add(key_title_summary)
    return deduped

def extract_summarize_and_analyze_content_from_urls(urls):
    articles = extract_news.extract_news_articles(urls)
    return extract_summarize_and_analyze_articles(articles)

def display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters=None):
    df = result["dataframe"]
    cluster_primary_topics = result.get("cluster_primary_topics", {})
    cluster_related_topics = result.get("cluster_related_topics", {})
    df["sentiment"] = df["sentiment"].str.capitalize()

    if sentiment_filters:

@@ -117,24 +130,50 @@ def display_clusters_as_columns(result, sentiment_filters=None):

    markdown_blocks = []

    for cluster_label, articles in clusters:
        cluster_md = f"<div style='border:2px solid #e0e0e0; border-radius:10px; margin-bottom:18px; padding:18px; background: #f9f9fa;'>"
        cluster_md += f"<h3 style='color:#2d6cdf;'>🧩 Cluster: {cluster_label}</h3>"

        lda_topics = articles["lda_topics"].iloc[0] if "lda_topics" in articles else ""
        if lda_topics:
            cluster_md += f"<b style='color:#0d47a1;'>Main Themes:</b> <span style='color:#1976d2'>{lda_topics}</span><br>"

        primary = cluster_primary_topics.get(cluster_label, [])
        if primary:
            cluster_md += f"<b style='color:#1b5e20;'>Primary Topics:</b> <span style='color:#388e3c'>{', '.join(primary)}</span><br>"

        related = cluster_related_topics.get(cluster_label, [])
        if related:
            cluster_md += f"<b style='color:#616161;'>Related Topics:</b> <span style='color:#757575'>{', '.join(related)}</span><br>"

        cluster_md += f"<b>Articles:</b> {len(articles)}<br><br>"

        for sentiment in ["Positive", "Neutral", "Negative"]:
            sentiment_articles = articles[articles["sentiment"] == sentiment]
            if not sentiment_articles.empty:
                color = {"Positive": "#e8f5e9", "Neutral": "#e3f2fd", "Negative": "#ffebee"}[sentiment]
                border = {"Positive": "#43a047", "Neutral": "#1976d2", "Negative": "#c62828"}[sentiment]
                sentiment_label = {
                    "Positive": "Positive News",
                    "Neutral": "Neutral News",
                    "Negative": "Negative News"
                }[sentiment]
                cluster_md += (
                    f"<div style='background:{color}; border-left:6px solid {border}; border-radius:6px; margin-bottom:10px; padding:10px;'>"
                    f"<span style='font-size:1.2em;'><b>{sentiment_label} ({len(sentiment_articles)})</b></span><br>"
                )
                for _, article in sentiment_articles.iterrows():
                    cluster_md += (
                        f"<div style='margin:10px 0 10px 0; padding:10px; border-bottom:1px solid #e0e0e0;'>"
                        f"<span style='font-weight:bold; color:#37474f;'>📰 {article['title']}</span><br>"
                        f"<span style='font-size:0.95em;'>"
                        f"<b>Source:</b> {article['source']}<br>"
                        f"<details><summary style='cursor:pointer; color:#1976d2;'><strong>Summary</strong></summary>"
                        f"<div style='margin-left:10px; color:#424242;'>{article['summary']}</div></details>"
                        f"<a href='{article['url']}' target='_blank' style='color:#1976d2;'>Read Full Article</a>"
                        f"</span></div>"
                    )
                cluster_md += "</div>"
        cluster_md += "</div>"
        markdown_blocks.append(cluster_md)

    while len(markdown_blocks) < 5:

@@ -149,88 +188,88 @@ def save_clustered_articles(df, topic):

    df.to_csv(csv_file, index=False)
    return csv_file, None

def update_ui_with_columns(topic, urls, sentiment_filters):
    extracted_articles = []

    if topic and topic.strip():
        return fetch_and_process_topic_news(topic, sentiment_filters)

    if urls:
        url_list = [url.strip() for url in urls.split("\n") if url.strip()]
        extracted_articles.extend(extract_summarize_and_analyze_content_from_urls(url_list))

    if not extracted_articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    deduped_articles = deduplicate_articles(extracted_articles)
    df = pd.DataFrame(deduped_articles)
    result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
    cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
    csv_file, _ = save_clustered_articles(result["dataframe"], topic or "batch_upload")
    topic_fig = plot_topic_frequency(result)
    sentiment_fig = plot_sentiment_trends(result)
    top_clusters_table = render_top_clusters_table(result)
    return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)

def clear_interface():
    return (
        "",                                  # topic_input
        ["Positive", "Neutral", "Negative"], # sentiment_filter
        "",                                  # urls_input
        "", "", "", "", "",                  # cluster columns 0–4
        gr.update(value=None),               # csv_output (reset download file)
        None, None, None,                    # topic_fig, sentiment_fig, top_clusters_table
        gr.update(visible=False)             # Hide Clustered News Digest section
    )

with gr.Blocks(theme=gr.themes.Base(), css="""
    .gr-markdown { margin: 10px; }
    .analytics-card {background: #f5f7fa; border-radius: 10px; padding: 18px; margin-bottom: 18px;}
""") as demo:
    gr.Markdown(
        "<h1 style='text-align:center;'>📰 Quick Pulse</h1>"
        "<h3 style='text-align:center; color:#1976d2;'>AI-Powered News Summarization with Real-Time Sentiment and Topic Insights</h3>"
        "<p style='text-align:center;'>From headlines to insight, Quick Pulse summarizes news stories, captures emotional context, clusters related topics, and provides analytics at a glance.</p>"
    )

    with gr.Row():
        with gr.Column(scale=2):
            topic_input = gr.Textbox(label="Enter Topic", placeholder="e.g. climate change")
            sentiment_filter = gr.CheckboxGroup(choices=["Positive", "Neutral", "Negative"], value=["Positive", "Neutral", "Negative"], label="Sentiment Filter")
            with gr.Accordion("🔗 Enter Multiple URLs", open=False):
                urls_input = gr.Textbox(label="Enter URLs (newline separated)", lines=4)
            with gr.Row():
                submit_button = gr.Button(" Generate Digest", scale=1)
                latest_news_button = gr.Button("Fetch & Summarize Top News", scale=1)
                clear_button = gr.Button(" Clear", scale=1)
            csv_output = gr.File(label="📁 Download Clustered Digest CSV")
        with gr.Column(scale=3):
            with gr.Row():
                topic_fig = gr.Plot(label="Topic Frequency")
                sentiment_fig = gr.Plot(label="Sentiment Trends")
            top_clusters_table = gr.Dataframe(label="Top Clusters")

    gr.Markdown("---")

    clustered_digest_section = gr.Group(visible=False)
    with clustered_digest_section:
        gr.Markdown("<h3 style='color:#1976d2;'>Clustered News Digest</h3>")
        with gr.Row():
            column_0 = gr.Markdown()
            column_1 = gr.Markdown()
            column_2 = gr.Markdown()
            column_3 = gr.Markdown()
            column_4 = gr.Markdown()

    submit_button.click(
        fn=update_ui_with_columns,
        inputs=[topic_input, urls_input, sentiment_filter],
        outputs=[
            sentiment_filter,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

@@ -240,28 +279,23 @@ with gr.Blocks(theme=gr.themes.Base(), css=".gr-markdown { margin: 10px; }") as demo:

        outputs=[
            sentiment_filter,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

    clear_button.click(
        fn=clear_interface,
        inputs=[],
        outputs=[
            topic_input, sentiment_filter, urls_input,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

if __name__ == "__main__":
    demo.launch()
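For reference, a minimal sketch (not part of the commit) of how deduplicate_articles treats the article dicts produced by extract_summarize_and_analyze_articles; the sample dicts are hypothetical, and importing app builds the Gradio Blocks as a side effect:

# hypothetical_dedup_demo.py — illustrative only
from app import deduplicate_articles  # importing app constructs the UI but does not launch it

articles = [
    {"url": "https://example.com/a", "title": "Rate cut expected", "source": "Reuters", "summary": "Markets expect a cut."},
    {"url": "https://example.com/a", "title": "Rate cut expected", "source": "Reuters", "summary": "Markets expect a cut."},   # same URL -> dropped
    {"url": "https://example.com/b", "title": "Rate cut expected", "source": "Reuters", "summary": "A different write-up."},   # same (title, source) -> dropped
]

print(len(deduplicate_articles(articles)))  # 1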
cluster_news.py
CHANGED
@@ -1,224 +1,169 @@

The KMeans pipeline (cluster_embeddings with a silhouette-based search for the cluster count), filter_similar_topics, get_representative_summaries, and the old topic-ranking block that produced primary_focus and related_topics are removed. New version:

# cluster_news.py
# Clusters news articles using HDBSCAN, labels clusters with TF-IDF n-grams and LDA topics,
# and falls back to a representative summary if the label is too vague.

import numpy as np
import pandas as pd
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.decomposition import LatentDirichletAllocation
import hdbscan
import umap

def generate_embeddings(df, content_column):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return np.array(embeddings)

def reduce_dimensions(embeddings, n_neighbors=10, min_dist=0.0, n_components=5, random_state=42):
    n_samples = embeddings.shape[0]
    if n_samples < 3:
        return embeddings
    n_components = min(max(2, n_components), n_samples - 2)
    n_neighbors = min(max(2, n_neighbors), n_samples - 1)
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state,
        metric='cosine'
    )
    reduced = reducer.fit_transform(embeddings)
    return reduced

def cluster_with_hdbscan(embeddings, min_cluster_size=2, min_samples=1):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean'
    )
    labels = clusterer.fit_predict(embeddings)
    return labels, clusterer

def extract_tfidf_labels(df, content_column, cluster_labels, top_n=6):
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        if label == -1: continue
        grouped[label].append(df.iloc[idx][content_column])
    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        if len(avg_tfidf) == 0:
            tfidf_labels[cluster_id] = []
            continue
        top_indices = np.argsort(avg_tfidf)[::-1][:top_n]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = top_terms
    return tfidf_labels

def lda_topic_modeling(texts, n_topics=1, n_words=6):
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
    X = vectorizer.fit_transform(texts)
    if X.shape[0] < n_topics:
        n_topics = max(1, X.shape[0])
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    topic_words = []
    for topic_idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[:-n_words - 1:-1]
        words = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        topic_words.extend(words)
    return topic_words

def get_representative_summary(df, cluster_indices, embeddings, centroid):
    cluster_embs = embeddings[cluster_indices]
    dists = cosine_distances(cluster_embs, centroid.reshape(1, -1)).flatten()
    min_idx = np.argmin(dists)
    return df.iloc[cluster_indices[min_idx]]["summary"]

def label_clusters_hybrid(df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=15):
    cluster_label_map = {}
    cluster_primary_topics = {}
    cluster_related_topics = {}
    for cluster_id in set(cluster_labels):
        if cluster_id == -1:
            continue
        topics = lda_labels.get(cluster_id, []) or tfidf_labels.get(cluster_id, [])
        topics = [t for t in topics if t]
        primary_topics = topics[:3]
        related_topics = topics[3:]
        label = ", ".join(primary_topics) if primary_topics else ""
        if not label or len(label) < vague_threshold:
            cluster_indices = np.where(cluster_labels == cluster_id)[0]
            centroid = embeddings[cluster_indices].mean(axis=0)
            rep_summary = get_representative_summary(df, cluster_indices, embeddings, centroid)
            label = rep_summary[:80] + "..." if len(rep_summary) > 80 else rep_summary
        cluster_label_map[cluster_id] = label
        cluster_primary_topics[cluster_id] = primary_topics
        cluster_related_topics[cluster_id] = related_topics
    return cluster_label_map, cluster_primary_topics, cluster_related_topics

def cluster_and_label_articles(
    df,
    content_column="content",
    summary_column="summary",
    min_cluster_size=2,
    min_samples=1,
    n_neighbors=10,
    min_dist=0.0,
    n_components=5,
    top_n=6,
    lda_n_topics=1,
    lda_n_words=6,
    vague_threshold=15
):
    if df.empty:
        return None

    min_cluster_size = max(2, min(min_cluster_size, len(df) // 2)) if len(df) < 20 else min_cluster_size

    embeddings = generate_embeddings(df, content_column)
    reduced_embeddings = reduce_dimensions(embeddings, n_neighbors, min_dist, n_components)
    cluster_labels, clusterer = cluster_with_hdbscan(reduced_embeddings, min_cluster_size, min_samples)
    df['cluster_id'] = cluster_labels

    tfidf_labels = extract_tfidf_labels(df, content_column, cluster_labels, top_n=top_n)

    lda_labels = {}
    for cluster_id in set(cluster_labels):
        if cluster_id == -1:
            continue
        cluster_texts = df[cluster_labels == cluster_id][content_column].tolist()
        if cluster_texts:
            topics = lda_topic_modeling(
                cluster_texts, n_topics=lda_n_topics, n_words=lda_n_words
            )
            lda_labels[cluster_id] = topics
        else:
            lda_labels[cluster_id] = []

    cluster_label_map, cluster_primary_topics, cluster_related_topics = label_clusters_hybrid(
        df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=vague_threshold
    )

    df['cluster_label'] = [
        cluster_label_map.get(cid, "Noise/Other") if cid != -1 else "Noise/Other"
        for cid in cluster_labels
    ]
    df['lda_topics'] = [
        ", ".join(lda_labels.get(cid, [])) if cid != -1 else "" for cid in cluster_labels
    ]

    detected_topics = {
        label: {
            "size": int((df['cluster_label'] == label).sum())
        }
        for label in set(df['cluster_label']) if label != "Noise/Other"
    }

    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": len(detected_topics),
        "cluster_primary_topics": cluster_primary_topics,
        "cluster_related_topics": cluster_related_topics
    }
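A small sketch (not part of the commit) showing the expected input and output shape of cluster_and_label_articles; the DataFrame rows are invented, the run needs sentence-transformers, umap-learn, and hdbscan installed, and with this few articles HDBSCAN may label everything "Noise/Other":

# illustrative driver for cluster_news.cluster_and_label_articles
import pandas as pd
import cluster_news

df = pd.DataFrame([
    {"content": "The central bank raised interest rates again this quarter amid inflation worries.", "summary": "Rates rise again."},
    {"content": "Fresh inflation data pushed the central bank toward another rate hike.", "summary": "Inflation drives hike."},
    {"content": "A new football season kicked off with record attendance across the league.", "summary": "Season opener draws crowds."},
    {"content": "Fans packed stadiums as the league returned this weekend.", "summary": "League returns to packed stadiums."},
])

result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
if result:
    print(result["number_of_clusters"])
    print(result["dataframe"][["cluster_id", "cluster_label"]])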
extract_news.py
CHANGED
@@ -1,244 +1,36 @@

This file is slimmed down sharply (+13 -221). Removed: the RSS extraction path (extract_full_content_rss, extract_news_articles_rss with BeautifulSoup), the URL validity, paywall, duplicate, and blocked-site helpers (is_paywalled, is_paywalled_content, is_duplicate, is_broken, is_valid_url, is_valid_url_content, is_empty_content, is_blocked_site, is_blocked_content), and save_to_json; extract_full_content now returns a dict with url, text, and title instead of a (content, title) tuple. New version:

# extract_news.py
# This script extracts full content from news articles using the newspaper3k library.

import logging
import pandas as pd
from newspaper import Article

def extract_full_content(url, min_length=100):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None

def extract_news_articles(urls, min_length=100):
    extracted_articles = []
    for url in urls:
        article = extract_full_content(url, min_length=min_length)
        if article and article.get("text"):
            article["original_url"] = url
            extracted_articles.append(article)
    return extracted_articles

def create_dataframe(articles):
    return pd.DataFrame(articles)

def save_to_csv(df, filename):
    df.to_csv(filename, index=False)
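A minimal sketch (not part of the commit) of driving the slimmed-down extract_news module end to end; the URL is a placeholder and a network connection plus newspaper3k are required:

# illustrative driver for extract_news
from extract_news import extract_news_articles, create_dataframe, save_to_csv

urls = ["https://example.com/some-article"]  # placeholder URL
articles = extract_news_articles(urls, min_length=100)
if articles:
    df = create_dataframe(articles)       # columns: url, text, title, original_url
    save_to_csv(df, "extracted_articles.csv")
    print(df[["title", "url"]])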
gather_news.py
CHANGED
@@ -1,73 +1,121 @@

The single NewsAPI fetch function that read the key from os.environ ("api_key") is removed; the new module reads config.api_key, fetches either top headlines or topic search, and merges NewsAPI metadata with the full text extracted by extract_news. New version:

# gather_news.py
# News Source Integration
# This script integrates with various news sources to fetch the latest articles from the specified news sources,
# extracts relevant information such as title, URL, Source, Author and Publish date, and extracts full content.

import requests
from extract_news import extract_news_articles, create_dataframe, save_to_csv

def fetch_newsapi_top_headlines(min_length=100, max_articles=30):
    import config
    url = 'https://newsapi.org/v2/top-headlines'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'pageSize': max_articles
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: Failed to fetch news from NewsAPI Top Headlines. Status code: {response.status_code}")
        return []
    articles = response.json().get("articles", [])
    if not articles:
        print("No articles found in NewsAPI Top Headlines.")
        return []
    meta_by_url = {}
    urls = []
    for article in articles:
        url = article.get("url", "#")
        meta = {
            "url": url,
            "title": article.get("title", ""),
            "source": article.get("source", {}).get("name", ""),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
        }
        meta_by_url[url] = meta
        urls.append(url)
    print(f"Fetched {len(urls)} article URLs from NewsAPI Top Headlines.")
    extracted_articles = extract_news_articles(urls, min_length=min_length)
    merged_articles = []
    for art in extracted_articles:
        meta = meta_by_url.get(art.get("original_url"))
        if not meta:
            meta = {
                "title": art.get("title", "Untitled"),
                "source": "",
                "author": "Unknown",
                "publishedAt": "Unknown"
            }
        merged = {
            "url": art.get("url"),
            "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
            "source": meta["source"],
            "author": meta["author"],
            "publishedAt": meta["publishedAt"],
            "text": art.get("text", ""),
        }
        merged_articles.append(merged)
    print(f"Usable articles after extraction (NewsAPI Top Headlines): {len(merged_articles)}")
    return merged_articles

def fetch_newsapi_everything(topic, min_length=100, max_articles=50):
    import config
    url = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'q': topic,
        'pageSize': max_articles,
        'sortBy': 'publishedAt'
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: Failed to fetch news from NewsAPI Everything. Status code: {response.status_code}")
        return []
    articles = response.json().get("articles", [])
    if not articles:
        print("No articles found in NewsAPI Everything.")
        return []
    meta_by_url = {}
    urls = []
    for article in articles:
        url = article.get("url", "#")
        meta = {
            "url": url,
            "title": article.get("title", ""),
            "source": article.get("source", {}).get("name", ""),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
        }
        meta_by_url[url] = meta
        urls.append(url)
    print(f"Fetched {len(urls)} article URLs from NewsAPI Everything.")
    extracted_articles = extract_news_articles(urls, min_length=min_length)
    merged_articles = []
    for art in extracted_articles:
        meta = meta_by_url.get(art.get("original_url"))
        if not meta:
            meta = {
                "title": art.get("title", "Untitled"),
                "source": "",
                "author": "Unknown",
                "publishedAt": "Unknown"
            }
        merged = {
            "url": art.get("url"),
            "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
            "source": meta["source"],
            "author": meta["author"],
            "publishedAt": meta["publishedAt"],
            "text": art.get("text", ""),
        }
        merged_articles.append(merged)
    print(f"Usable articles after extraction (NewsAPI Everything): {len(merged_articles)}")
    return merged_articles

def fetch_articles(topic=None, min_length=100, max_articles=30):
    if topic and topic.strip():
        return fetch_newsapi_everything(topic, min_length=min_length, max_articles=max_articles)
    else:
        return fetch_newsapi_top_headlines(min_length=min_length, max_articles=max_articles)
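A minimal sketch (not part of the commit) of calling the new fetch layer; it assumes a config.py next to gather_news.py defining api_key with a valid NewsAPI key, which is how the module reads its credentials:

# illustrative driver for gather_news.fetch_articles
# assumes config.py provides: api_key = "<your NewsAPI key>"
import gather_news

articles = gather_news.fetch_articles(topic="climate change", max_articles=10)
for art in articles[:3]:
    print(art["publishedAt"], art["source"], art["title"])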
input_topic.py
CHANGED
@@ -1,7 +1,4 @@
 # input_topic.py
-
-
-# Input Design
 # This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.
 
 def get_topic():
requirements.txt
CHANGED
@@ -9,4 +9,7 @@ numpy
 requests
 gradio
 lxml_html_clean
+plotly.express
+hdbscan
+umap
 sentence_transformers
summarizer.py
CHANGED
@@ -6,14 +6,6 @@ from transformers import pipeline
 # Load summarization pipeline
 summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")
 
-# Load once globally
-
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-#tokenizer = AutoTokenizer.from_pretrained("flant5-base")
-#model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
-#summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
-
 # Function to split text into smaller chunks
 def split_text(text, max_tokens=512):
     words = text.split()
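The hunk shows only the first lines of split_text; a hedged sketch (not part of the commit) of the chunk-then-summarize pattern the module suggests. generate_summary is the entry point app.py calls per article; the return type of split_text (assumed to be a list of text chunks) and the placeholder text are assumptions:

# illustrative use of summarizer.split_text / generate_summary
from summarizer import split_text, generate_summary

long_text = " ".join(["This is a placeholder sentence for a long article body."] * 200)
chunks = split_text(long_text, max_tokens=512)   # assumed: list of chunks sized for the model
print(len(chunks))
print(generate_summary(long_text))               # same call app.py makes for each article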