harao-ml committed
Commit 97420da · verified · 1 Parent(s): a445055

Upload 8 files

Files changed (8)
  1. analyze_sentiment.py +9 -13
  2. app.py +208 -174
  3. cluster_news.py +137 -192
  4. extract_news.py +13 -221
  5. gather_news.py +110 -62
  6. input_topic.py +0 -3
  7. requirements.txt +3 -0
  8. summarizer.py +0 -8
analyze_sentiment.py CHANGED
@@ -1,28 +1,24 @@

  # analyze_sentiment.py
-
  # This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.

-
  from transformers import pipeline

-
- # Load sentiment analysis pipeline
- sentiment_analyzer = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")

  def analyze_summary(summary):
      """
-     Analyze the sentiment of the given summary.
      Returns a tuple of (sentiment, score).
      """
      try:
          if not summary.strip():
              return "No input provided.", 0.0
-
-         result = sentiment_analyzer(summary)[0]
-         sentiment = result['label']
-         score = result['score']
-
          return sentiment, score
      except Exception as e:
-         return f"Error analyzing sentiment: {str(e)}", 0.0
- # Example usage

  # analyze_sentiment.py
  # This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.

  from transformers import pipeline

+ # Load zero-shot classification pipeline
+ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

  def analyze_summary(summary):
      """
+     Analyze the sentiment of the given summary using zero-shot classification.
      Returns a tuple of (sentiment, score).
      """
      try:
          if not summary.strip():
              return "No input provided.", 0.0
+
+         candidate_labels = ["positive", "neutral", "negative"]
+         result = classifier(summary, candidate_labels)
+         sentiment = result['labels'][0].capitalize()
+         score = float(result['scores'][0])
          return sentiment, score
      except Exception as e:
+         return f"Error analyzing sentiment: {str(e)}", 0.0
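For a quick standalone check of the new zero-shot approach, a minimal sketch is shown below; the helper name classify_text and the sample sentence are illustrative and not part of the commit, and the facebook/bart-large-mnli weights are downloaded on first use.

# Illustrative sketch: exercising the zero-shot sentiment call in isolation.
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify_text(text, labels=("positive", "neutral", "negative")):
    # Mirrors analyze_summary's contract: (label, score) for the top-ranked candidate label.
    result = classifier(text, list(labels))
    return result["labels"][0].capitalize(), float(result["scores"][0])

print(classify_text("The market rallied after a strong earnings report."))
# Expected shape: ('Positive', 0.9...); the exact score depends on the model.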
app.py CHANGED
@@ -1,110 +1,123 @@
1
- import gradio as gr
 
 
2
  import pandas as pd
3
  import cluster_news
4
- import extract_news
5
  import summarizer
6
  import analyze_sentiment
7
- import gather_news
 
 
8
 
9
- # ------------------ Utilities ------------------
10
 
11
- def fetch_content(topic):
12
- articles = gather_news.fetch_articles_newsapi(topic)
13
- if isinstance(articles, str):
14
- articles = gather_news.fetch_articles_google(topic)
15
- if isinstance(articles, str):
16
- return None
17
- try:
18
- articles = sorted(articles, key=lambda x: x.get("publishedAt", ""), reverse=True)[:10]
19
- except Exception:
20
- return None
21
- return articles
22
 
23
  def fetch_and_process_latest_news(sentiment_filters):
24
- topic = "Top Headlines"
25
- articles = gather_news.fetch_articles_newsapi("top headlines")
26
- if isinstance(articles, str) or not articles:
27
- return sentiment_filters, "### No latest news available", "", "", "", "", None
28
 
29
- articles = sorted(articles, key=lambda x: x.get("publishedAt", ""), reverse=True)[:10]
30
- extracted_articles = extract_summarize_and_analyze_articles(articles)
 
31
 
32
- if not extracted_articles:
33
- return sentiment_filters, "### No content to display", "", "", "", "", None
 
34
 
35
- df = pd.DataFrame(extracted_articles)
36
  result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
37
- cluster_md_blocks = display_clusters_as_columns(result, sentiment_filters)
38
- csv_file, _ = save_clustered_articles(result["dataframe"], topic)
39
 
40
- return sentiment_filters, *cluster_md_blocks, csv_file
41
 
42
  def extract_summarize_and_analyze_articles(articles):
43
  extracted_articles = []
44
  for article in articles:
45
- url = article.get("url")
46
- if url:
47
- content, _ = extract_news.extract_full_content(url)
48
- if content:
49
- summary = summarizer.generate_summary(content)
50
- sentiment, score = analyze_sentiment.analyze_summary(summary)
51
- extracted_articles.append({
52
- "title": article.get("title", "No title"),
53
- "url": url,
54
- "source": article.get("source", "Unknown"),
55
- "author": article.get("author", "Unknown"),
56
- "publishedAt": article.get("publishedAt", "Unknown"),
57
- "content": content,
58
- "summary": summary,
59
- "sentiment": sentiment,
60
- "score": score
61
- })
62
  return extracted_articles
63
 
64
- def extract_summarize_and_analyze_content_from_file(files):
65
- extracted_articles = []
66
- for file in files:
67
- with open(file.name, "r", encoding="utf-8") as f:
68
- content = f.read()
69
- if content.strip():
70
- summary = summarizer.generate_summary(content)
71
- sentiment, score = analyze_sentiment.analyze_summary(summary)
72
- extracted_articles.append({
73
- "title": "Custom File",
74
- "url": "N/A",
75
- "source": "Uploaded File",
76
- "author": "Unknown",
77
- "publishedAt": "Unknown",
78
- "content": content,
79
- "summary": summary,
80
- "sentiment": sentiment,
81
- "score": score
82
- })
83
- return extracted_articles
 
 
 
 
84
 
85
  def extract_summarize_and_analyze_content_from_urls(urls):
86
- extracted_articles = []
87
- for url in urls:
88
- content, title = extract_news.extract_full_content(url)
89
- if content: # Only proceed if content is successfully extracted
90
- summary = summarizer.generate_summary(content)
91
- sentiment, score = analyze_sentiment.analyze_summary(summary)
92
- extracted_articles.append({
93
- "title": title if title else "Untitled Article",
94
- "url": url,
95
- "source": "External Link",
96
- "author": "Unknown",
97
- "publishedAt": "Unknown",
98
- "content": content,
99
- "summary": summary,
100
- "sentiment": sentiment,
101
- "score": score
102
- })
103
- return extracted_articles
104
 
105
- def display_clusters_as_columns(result, sentiment_filters=None):
106
  df = result["dataframe"]
107
- detected_topics = result.get("detected_topics", {})
 
108
  df["sentiment"] = df["sentiment"].str.capitalize()
109
 
110
  if sentiment_filters:
@@ -117,24 +130,50 @@ def display_clusters_as_columns(result, sentiment_filters=None):
117
  markdown_blocks = []
118
 
119
  for cluster_label, articles in clusters:
120
- cluster_md = f"### 🧩 Cluster {cluster_label}\n"
121
- if cluster_label in detected_topics:
122
- topics = detected_topics[cluster_label]
123
- cluster_md += f"**Primary Topic:** {topics['primary_focus']}\n\n"
124
- if topics["related_topics"]:
125
- cluster_md += f"**Related Topics:** {', '.join(topics['related_topics'])}\n\n"
126
- cluster_md += f"**Articles:** {len(articles)}\n\n"
127
- for _, article in articles.iterrows():
128
- cluster_md += (
129
- f"#### 📰 {article['title']}\n"
130
- f"- **Source:** {article['source']}\n"
131
- f"- **Sentiment:** {article['sentiment']}\n"
132
- f"<details><summary><strong>Summary</strong></summary>\n"
133
- f"{article['summary']}\n"
134
- f"</details>\n"
135
- f"- [Read Full Article]({article['url']})\n\n"
136
- )
137
-
 
 
138
  markdown_blocks.append(cluster_md)
139
 
140
  while len(markdown_blocks) < 5:
@@ -149,88 +188,88 @@ def save_clustered_articles(df, topic):
149
  df.to_csv(csv_file, index=False)
150
  return csv_file, None
151
 
152
- # ------------------ Pipeline Trigger ------------------
153
-
154
- def update_ui_with_columns(topic, files, urls, sentiment_filters):
155
  extracted_articles = []
156
 
157
- if topic.strip():
158
- articles = fetch_content(topic)
159
- if articles:
160
- extracted_articles.extend(extract_summarize_and_analyze_articles(articles))
161
-
162
- if files:
163
- extracted_articles.extend(extract_summarize_and_analyze_content_from_file(files))
164
 
165
  if urls:
166
  url_list = [url.strip() for url in urls.split("\n") if url.strip()]
167
  extracted_articles.extend(extract_summarize_and_analyze_content_from_urls(url_list))
168
 
169
  if not extracted_articles:
170
- return sentiment_filters, "### No content to display", "", "", "", "", None
171
 
172
- df = pd.DataFrame(extracted_articles)
 
173
  result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
174
- cluster_md_blocks = display_clusters_as_columns(result, sentiment_filters)
175
  csv_file, _ = save_clustered_articles(result["dataframe"], topic or "batch_upload")
176
-
177
- return sentiment_filters, *cluster_md_blocks, csv_file
 
 
178
 
179
  def clear_interface():
180
  return (
181
  "", # topic_input
182
  ["Positive", "Neutral", "Negative"],# sentiment_filter
183
- gr.update(value=None), # uploaded_files (reset file upload)
184
  "", # urls_input
185
  "", "", "", "", "", # cluster columns 0–4
186
- gr.update(value=None) # csv_output (reset download file)
 
 
187
  )
188
 
189
-
190
- # ------------------ Gradio UI ------------------
191
-
192
- with gr.Blocks(theme=gr.themes.Base(), css=".gr-markdown { margin: 10px; }") as demo:
193
-
194
- # Header Section
195
- gr.Markdown("# 📰 Quick Pulse")
196
- gr.Markdown("### AI-Powered News Summarization with Real-Time Sentiment and Topic Insights")
197
  gr.Markdown(
198
- "From headlines to insight, Quick Pulse summarizes news stories, captures emotional context, and clusters related topics to provide structured intelligence—faster than ever")
199
-
200
- # Input Section
201
- gr.Markdown("---") # Horizontal line for separation
202
- with gr.Accordion("🗞️ Latest Top Headlines", open=False):
203
- latest_news_button = gr.Button("Fetch & Summarize Top 10 Headlines")
204
-
205
- with gr.Row():
206
- topic_input = gr.Textbox(label="Enter Topic", placeholder="e.g. climate change")
207
- sentiment_filter = gr.CheckboxGroup(choices=["Positive", "Neutral", "Negative"], value=["Positive", "Neutral", "Negative"], label="Sentiment Filter")
208
- csv_output = gr.File(label="📁 Download Clustered Digest CSV")
209
-
210
- with gr.Accordion("📂 Upload Articles (.txt files)", open=False):
211
- uploaded_files = gr.File(label="Upload .txt Files", file_types=[".txt"], file_count="multiple")
212
-
213
- with gr.Accordion("🔗 Enter Multiple URLs", open=False):
214
- urls_input = gr.Textbox(label="Enter URLs (newline separated)", lines=4)
215
-
216
- with gr.Row():
217
- submit_button = gr.Button(" Generate Digest")
218
- clear_button = gr.Button(" Clear")
219
 
220
  with gr.Row():
221
- column_0 = gr.Markdown()
222
- column_1 = gr.Markdown()
223
- column_2 = gr.Markdown()
224
- column_3 = gr.Markdown()
225
- column_4 = gr.Markdown()
 
 
226
 
227
  submit_button.click(
228
  fn=update_ui_with_columns,
229
- inputs=[topic_input, uploaded_files, urls_input, sentiment_filter],
230
  outputs=[
231
  sentiment_filter,
232
  column_0, column_1, column_2, column_3, column_4,
233
- csv_output
 
 
234
  ]
235
  )
236
 
@@ -240,28 +279,23 @@ with gr.Blocks(theme=gr.themes.Base(), css=".gr-markdown { margin: 10px; }") as
240
  outputs=[
241
  sentiment_filter,
242
  column_0, column_1, column_2, column_3, column_4,
243
- csv_output
 
 
244
  ]
245
  )
246
 
247
  clear_button.click(
248
- fn=clear_interface,
249
- inputs=[],
250
- outputs=[
251
- topic_input, # 1
252
- sentiment_filter, # 2
253
- uploaded_files, # 3
254
- urls_input, # 4
255
- column_0, # 5
256
- column_1, # 6
257
- column_2, # 7
258
- column_3, # 8
259
- column_4, # 9
260
- csv_output # 10
261
- ]
262
- )
263
-
264
-
265
 
266
  if __name__ == "__main__":
267
- demo.launch()
 
1
+ ## This script provides a Gradio interface for gathering, clustering, summarizing, and analyzing news articles with sentiment analysis and topic modeling.
2
+
3
+ import gather_news
4
  import pandas as pd
5
  import cluster_news
 
6
  import summarizer
7
  import analyze_sentiment
8
+ import extract_news
9
+ import gradio as gr
10
+ import plotly.express as px
11
 
12
+ def plot_topic_frequency(result):
13
+ df = result["dataframe"]
14
+ topic_counts = df["cluster_label"].value_counts().reset_index()
15
+ topic_counts.columns = ["Topic", "Count"]
16
+ fig = px.bar(topic_counts, x="Topic", y="Count", title="Topic Frequency", color="Topic")
17
+ fig.update_layout(showlegend=False, height=350)
18
+ return fig
19
 
20
+ def plot_sentiment_trends(result):
21
+ df = result["dataframe"]
22
+ sentiment_counts = df["sentiment"].value_counts().reset_index()
23
+ sentiment_counts.columns = ["Sentiment", "Count"]
24
+ fig = px.pie(sentiment_counts, names="Sentiment", values="Count", title="Sentiment Distribution")
25
+ fig.update_traces(textinfo='label+percent')
26
+ fig.update_layout(height=350)
27
+ return fig
28
+
29
+ def render_top_clusters_table(result, top_n=5):
30
+ df = result["dataframe"]
31
+ cluster_counts = df["cluster_label"].value_counts().reset_index()
32
+ cluster_counts.columns = ["Cluster", "Articles"]
33
+ top_clusters = cluster_counts.head(top_n)
34
+ return top_clusters
35
 
36
  def fetch_and_process_latest_news(sentiment_filters):
37
+ articles = gather_news.fetch_newsapi_top_headlines()
38
+ return process_and_display_articles(articles, sentiment_filters, "Top Headlines")
 
 
39
 
40
+ def fetch_and_process_topic_news(topic, sentiment_filters):
41
+ articles = gather_news.fetch_newsapi_everything(topic)
42
+ return process_and_display_articles(articles, sentiment_filters, topic or "Topic")
43
 
44
+ def process_and_display_articles(articles, sentiment_filters, topic_label):
45
+ if not articles:
46
+ return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)
47
 
48
+ articles = sorted(articles, key=lambda x: x.get("publishedAt", ""), reverse=True)
49
+ extracted_articles = extract_summarize_and_analyze_articles(articles)
50
+ deduped_articles = deduplicate_articles(extracted_articles)
51
+ if not deduped_articles:
52
+ return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)
53
+
54
+ df = pd.DataFrame(deduped_articles)
55
  result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
56
+ cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
57
+ csv_file, _ = save_clustered_articles(result["dataframe"], topic_label)
58
+
59
+ # Analytics
60
+ topic_fig = plot_topic_frequency(result)
61
+ sentiment_fig = plot_sentiment_trends(result)
62
+ top_clusters_table = render_top_clusters_table(result)
63
 
64
+ return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)
65
 
66
  def extract_summarize_and_analyze_articles(articles):
67
  extracted_articles = []
68
  for article in articles:
69
+ content = article.get("text") or article.get("content")
70
+ if not content:
71
+ continue
72
+ title = article.get("title", "No title")
73
+ summary = summarizer.generate_summary(content)
74
+ sentiment, score = analyze_sentiment.analyze_summary(summary)
75
+ extracted_articles.append({
76
+ "title": title,
77
+ "url": article.get("url"),
78
+ "source": article.get("source", "Unknown"),
79
+ "author": article.get("author", "Unknown"),
80
+ "publishedAt": article.get("publishedAt", "Unknown"),
81
+ "content": content,
82
+ "summary": summary,
83
+ "sentiment": sentiment,
84
+ "score": score
85
+ })
86
  return extracted_articles
87
 
88
+ def deduplicate_articles(articles):
89
+ seen_urls = set()
90
+ seen_title_source = set()
91
+ seen_title_summary = set()
92
+ deduped = []
93
+ for art in articles:
94
+ url = art.get("url")
95
+ title = art.get("title", "").strip().lower()
96
+ source = art.get("source", "").strip().lower()
97
+ summary = art.get("summary", "").strip().lower()
98
+ key_title_source = (title, source)
99
+ key_title_summary = (title, summary)
100
+ if url and url in seen_urls:
101
+ continue
102
+ if key_title_source in seen_title_source:
103
+ continue
104
+ if key_title_summary in seen_title_summary:
105
+ continue
106
+ deduped.append(art)
107
+ if url:
108
+ seen_urls.add(url)
109
+ seen_title_source.add(key_title_source)
110
+ seen_title_summary.add(key_title_summary)
111
+ return deduped
112
 
113
  def extract_summarize_and_analyze_content_from_urls(urls):
114
+ articles = extract_news.extract_news_articles(urls)
115
+ return extract_summarize_and_analyze_articles(articles)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
+ def display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters=None):
118
  df = result["dataframe"]
119
+ cluster_primary_topics = result.get("cluster_primary_topics", {})
120
+ cluster_related_topics = result.get("cluster_related_topics", {})
121
  df["sentiment"] = df["sentiment"].str.capitalize()
122
 
123
  if sentiment_filters:
 
130
  markdown_blocks = []
131
 
132
  for cluster_label, articles in clusters:
133
+ cluster_md = f"<div style='border:2px solid #e0e0e0; border-radius:10px; margin-bottom:18px; padding:18px; background: #f9f9fa;'>"
134
+ cluster_md += f"<h3 style='color:#2d6cdf;'>🧩 Cluster: {cluster_label}</h3>"
135
+
136
+ lda_topics = articles["lda_topics"].iloc[0] if "lda_topics" in articles else ""
137
+ if lda_topics:
138
+ cluster_md += f"<b style='color:#0d47a1;'>Main Themes:</b> <span style='color:#1976d2'>{lda_topics}</span><br>"
139
+
140
+ primary = cluster_primary_topics.get(cluster_label, [])
141
+ if primary:
142
+ cluster_md += f"<b style='color:#1b5e20;'>Primary Topics:</b> <span style='color:#388e3c'>{', '.join(primary)}</span><br>"
143
+
144
+ related = cluster_related_topics.get(cluster_label, [])
145
+ if related:
146
+ cluster_md += f"<b style='color:#616161;'>Related Topics:</b> <span style='color:#757575'>{', '.join(related)}</span><br>"
147
+
148
+ cluster_md += f"<b>Articles:</b> {len(articles)}<br><br>"
149
+
150
+ for sentiment in ["Positive", "Neutral", "Negative"]:
151
+ sentiment_articles = articles[articles["sentiment"] == sentiment]
152
+ if not sentiment_articles.empty:
153
+ color = {"Positive": "#e8f5e9", "Neutral": "#e3f2fd", "Negative": "#ffebee"}[sentiment]
154
+ border = {"Positive": "#43a047", "Neutral": "#1976d2", "Negative": "#c62828"}[sentiment]
155
+ sentiment_label = {
156
+ "Positive": "Positive News",
157
+ "Neutral": "Neutral News",
158
+ "Negative": "Negative News"
159
+ }[sentiment]
160
+ cluster_md += (
161
+ f"<div style='background:{color}; border-left:6px solid {border}; border-radius:6px; margin-bottom:10px; padding:10px;'>"
162
+ f"<span style='font-size:1.2em;'><b>{sentiment_label} ({len(sentiment_articles)})</b></span><br>"
163
+ )
164
+ for _, article in sentiment_articles.iterrows():
165
+ cluster_md += (
166
+ f"<div style='margin:10px 0 10px 0; padding:10px; border-bottom:1px solid #e0e0e0;'>"
167
+ f"<span style='font-weight:bold; color:#37474f;'>📰 {article['title']}</span><br>"
168
+ f"<span style='font-size:0.95em;'>"
169
+ f"<b>Source:</b> {article['source']}<br>"
170
+ f"<details><summary style='cursor:pointer; color:#1976d2;'><strong>Summary</strong></summary>"
171
+ f"<div style='margin-left:10px; color:#424242;'>{article['summary']}</div></details>"
172
+ f"<a href='{article['url']}' target='_blank' style='color:#1976d2;'>Read Full Article</a>"
173
+ f"</span></div>"
174
+ )
175
+ cluster_md += "</div>"
176
+ cluster_md += "</div>"
177
  markdown_blocks.append(cluster_md)
178
 
179
  while len(markdown_blocks) < 5:
 
188
  df.to_csv(csv_file, index=False)
189
  return csv_file, None
190
 
191
+ def update_ui_with_columns(topic, urls, sentiment_filters):
 
 
192
  extracted_articles = []
193
 
194
+ if topic and topic.strip():
195
+ return fetch_and_process_topic_news(topic, sentiment_filters)
 
 
 
 
 
196
 
197
  if urls:
198
  url_list = [url.strip() for url in urls.split("\n") if url.strip()]
199
  extracted_articles.extend(extract_summarize_and_analyze_content_from_urls(url_list))
200
 
201
  if not extracted_articles:
202
+ return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)
203
 
204
+ deduped_articles = deduplicate_articles(extracted_articles)
205
+ df = pd.DataFrame(deduped_articles)
206
  result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
207
+ cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
208
  csv_file, _ = save_clustered_articles(result["dataframe"], topic or "batch_upload")
209
+ topic_fig = plot_topic_frequency(result)
210
+ sentiment_fig = plot_sentiment_trends(result)
211
+ top_clusters_table = render_top_clusters_table(result)
212
+ return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)
213
 
214
  def clear_interface():
215
  return (
216
  "", # topic_input
217
  ["Positive", "Neutral", "Negative"],# sentiment_filter
 
218
  "", # urls_input
219
  "", "", "", "", "", # cluster columns 0–4
220
+ gr.update(value=None), # csv_output (reset download file)
221
+ None, None, None, # topic_fig, sentiment_fig, top_clusters_table
222
+ gr.update(visible=False) # Hide Clustered News Digest section
223
  )
224
 
225
+ with gr.Blocks(theme=gr.themes.Base(), css="""
226
+ .gr-markdown { margin: 10px; }
227
+ .analytics-card {background: #f5f7fa; border-radius: 10px; padding: 18px; margin-bottom: 18px;}
228
+ """) as demo:
 
 
 
 
229
  gr.Markdown(
230
+ "<h1 style='text-align:center;'>📰 Quick Pulse</h1>"
231
+ "<h3 style='text-align:center; color:#1976d2;'>AI-Powered News Summarization with Real-Time Sentiment and Topic Insights</h3>"
232
+ "<p style='text-align:center;'>From headlines to insight, Quick Pulse summarizes news stories, captures emotional context, clusters related topics, and provides analytics at a glance.</p>"
233
+ )
 
 
 
234
 
235
  with gr.Row():
236
+ with gr.Column(scale=2):
237
+ topic_input = gr.Textbox(label="Enter Topic", placeholder="e.g. climate change")
238
+ sentiment_filter = gr.CheckboxGroup(choices=["Positive", "Neutral", "Negative"], value=["Positive", "Neutral", "Negative"], label="Sentiment Filter")
239
+ with gr.Accordion("🔗 Enter Multiple URLs", open=False):
240
+ urls_input = gr.Textbox(label="Enter URLs (newline separated)", lines=4)
241
+ with gr.Row():
242
+ submit_button = gr.Button(" Generate Digest", scale=1)
243
+ latest_news_button = gr.Button("Fetch & Summarize Top News", scale=1)
244
+ clear_button = gr.Button(" Clear", scale=1)
245
+ csv_output = gr.File(label="📁 Download Clustered Digest CSV")
246
+ with gr.Column(scale=3):
247
+ with gr.Row():
248
+ topic_fig = gr.Plot(label="Topic Frequency")
249
+ sentiment_fig = gr.Plot(label="Sentiment Trends")
250
+ top_clusters_table = gr.Dataframe(label="Top Clusters")
251
+
252
+ gr.Markdown("---")
253
+
254
+ clustered_digest_section = gr.Group(visible=False)
255
+ with clustered_digest_section:
256
+ gr.Markdown("<h3 style='color:#1976d2;'>Clustered News Digest</h3>")
257
+ with gr.Row():
258
+ column_0 = gr.Markdown()
259
+ column_1 = gr.Markdown()
260
+ column_2 = gr.Markdown()
261
+ column_3 = gr.Markdown()
262
+ column_4 = gr.Markdown()
263
 
264
  submit_button.click(
265
  fn=update_ui_with_columns,
266
+ inputs=[topic_input, urls_input, sentiment_filter],
267
  outputs=[
268
  sentiment_filter,
269
  column_0, column_1, column_2, column_3, column_4,
270
+ csv_output,
271
+ topic_fig, sentiment_fig, top_clusters_table,
272
+ clustered_digest_section
273
  ]
274
  )
275
 
 
279
  outputs=[
280
  sentiment_filter,
281
  column_0, column_1, column_2, column_3, column_4,
282
+ csv_output,
283
+ topic_fig, sentiment_fig, top_clusters_table,
284
+ clustered_digest_section
285
  ]
286
  )
287
 
288
  clear_button.click(
289
+ fn=clear_interface,
290
+ inputs=[],
291
+ outputs=[
292
+ topic_input, sentiment_filter, urls_input,
293
+ column_0, column_1, column_2, column_3, column_4,
294
+ csv_output,
295
+ topic_fig, sentiment_fig, top_clusters_table,
296
+ clustered_digest_section
297
+ ]
298
+ )
 
 
299
 
300
  if __name__ == "__main__":
301
+ demo.launch()
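The new app.py drops an article when its URL, its (title, source) pair, or its (title, summary) pair has already been seen. Below is a self-contained re-statement of that rule (not the committed function itself) with illustrative records:

# Re-statement of the deduplicate_articles rule from app.py, on toy data (illustrative only).
def dedup(articles):
    seen_urls, seen_title_source, seen_title_summary = set(), set(), set()
    kept = []
    for art in articles:
        url = art.get("url")
        title = art.get("title", "").strip().lower()
        source = art.get("source", "").strip().lower()
        summary = art.get("summary", "").strip().lower()
        if (url and url in seen_urls) or (title, source) in seen_title_source or (title, summary) in seen_title_summary:
            continue
        kept.append(art)
        if url:
            seen_urls.add(url)
        seen_title_source.add((title, source))
        seen_title_summary.add((title, summary))
    return kept

articles = [
    {"url": "https://example.com/a", "title": "Rates hold steady", "source": "Wire", "summary": "Central bank pauses."},
    {"url": "https://example.com/a", "title": "Rates hold steady", "source": "Wire", "summary": "Central bank pauses."},   # duplicate URL
    {"url": "https://example.com/b", "title": "Rates hold steady", "source": "Wire", "summary": "A different write-up."},  # duplicate title+source
]
print(len(dedup(articles)))  # expected: 1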
cluster_news.py CHANGED
@@ -1,224 +1,169 @@
1
  import numpy as np
2
  import pandas as pd
3
- from sklearn.cluster import KMeans
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.decomposition import LatentDirichletAllocation
6
- from sklearn.metrics import silhouette_score
7
  from collections import defaultdict
8
  from sentence_transformers import SentenceTransformer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
-
 
 
 
11
 
12
  def generate_embeddings(df, content_column):
13
- """
14
- Generate embeddings for the content using SentenceTransformer.
15
- """
16
- print("🔢 Generating embeddings for clustering...")
17
  model = SentenceTransformer('all-MiniLM-L6-v2')
18
  embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
19
- return embeddings
20
-
21
-
22
- def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
23
- """
24
- Determine the optimum number of clusters using silhouette analysis.
25
- """
26
- print("🔍 Determining the optimum number of clusters using silhouette analysis...")
27
- n_samples = len(embeddings)
28
- if n_samples < 2:
29
- raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")
30
-
31
- # Adjust max_clusters to ensure it does not exceed n_samples - 1
32
- max_clusters = min(max_clusters, n_samples - 1)
33
-
34
- best_num_clusters = min_clusters
35
- best_score = -1
36
-
37
- for n_clusters in range(min_clusters, max_clusters + 1):
38
- try:
39
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
40
- cluster_labels = kmeans.fit_predict(embeddings)
41
- score = silhouette_score(embeddings, cluster_labels)
42
- print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")
43
-
44
- if score > best_score:
45
- best_score = score
46
- best_num_clusters = n_clusters
47
- except ValueError as e:
48
- print(f"Skipping {n_clusters} clusters due to error: {e}")
49
-
50
- print(f"✅ Optimum number of clusters determined: {best_num_clusters}")
51
- return best_num_clusters
52
-
53
-
54
- def cluster_embeddings(embeddings, num_clusters):
55
- """
56
- Perform KMeans clustering on the embeddings.
57
- """
58
- print(f"📊 Clustering articles into {num_clusters} clusters using KMeans...")
59
- kmeans = KMeans(n_clusters=num_clusters, random_state=42)
60
- kmeans.fit(embeddings)
61
- return kmeans.labels_, kmeans
62
-
63
-
64
- def extract_tfidf_labels(df, content_column, cluster_labels):
65
- """
66
- Extract top TF-IDF keywords for each cluster.
67
- """
68
- print("🔠 Extracting TF-IDF-based keywords for cluster labels...")
69
  grouped = defaultdict(list)
70
  for idx, label in enumerate(cluster_labels):
 
71
  grouped[label].append(df.iloc[idx][content_column])
72
-
73
  tfidf_labels = {}
74
  for cluster_id, texts in grouped.items():
75
  vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
76
  tfidf_matrix = vectorizer.fit_transform(texts)
77
  avg_tfidf = tfidf_matrix.mean(axis=0).A1
78
- top_indices = np.argsort(avg_tfidf)[::-1][:3]
 
 
 
79
  top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
80
- tfidf_labels[cluster_id] = ", ".join(top_terms)
81
-
82
  return tfidf_labels
83
 
84
- def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
85
- """
86
- Apply topic modeling (LDA) within each cluster to refine and describe topics.
87
- """
88
- print("🔍 Applying topic modeling within each cluster...")
89
- grouped = defaultdict(list)
90
- for idx, label in enumerate(cluster_labels):
91
- grouped[label].append(df.iloc[idx][content_column])
92
-
93
- topic_labels = {}
94
- for cluster_id, texts in grouped.items():
95
- vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
96
- tfidf_matrix = vectorizer.fit_transform(texts)
97
-
98
- lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
99
- lda.fit(tfidf_matrix)
100
-
101
- # Extract top words for each topic
102
- feature_names = vectorizer.get_feature_names_out()
103
- topics = []
104
- for topic_idx, topic in enumerate(lda.components_):
105
- top_indices = topic.argsort()[:-4:-1]
106
- topics.append(", ".join([feature_names[i] for i in top_indices]))
107
- topic_labels[cluster_id] = " | ".join(topics)
108
-
109
- return topic_labels
110
-
111
-
112
- def filter_similar_topics(topic_keywords_list, threshold=0.75):
113
- """
114
- Filter out similar topics based on cosine similarity of their embeddings.
115
- """
116
- print("🔄 Filtering similar topics...")
117
- model = SentenceTransformer('all-MiniLM-L6-v2')
118
- topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
119
- embeddings = model.encode(topic_sentences)
120
- unique_indices = []
121
- for i, emb in enumerate(embeddings):
122
- if all(cosine_similarity([emb], [embeddings[j]])[0][0] < threshold for j in unique_indices):
123
- unique_indices.append(i)
124
- return [topic_keywords_list[i] for i in unique_indices]
125
-
126
-
127
- def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
128
- """
129
- Get the most representative summary for each cluster based on proximity to the cluster centroid.
130
- """
131
- print("🔄 Refining cluster labels using representative summaries...")
132
- representatives = {}
133
- for i in range(kmeans.n_clusters):
134
- indices = [j for j, label in enumerate(cluster_labels) if label == i]
135
- if not indices:
136
  continue
137
- cluster_embeddings = embeddings[indices]
138
- centroid = kmeans.cluster_centers_[i]
139
- distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
140
- closest_idx = indices[np.argmin(distances)]
141
- representatives[i] = df.iloc[closest_idx][summary_column]
142
-
143
- return representatives
144
-
145
-
146
- def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
147
- """
148
- Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and Topic Modeling.
149
- Display detected topics for each cluster with Primary focus and Related topics.
150
- """
 
 
 
151
  if df.empty:
152
- print("No articles to cluster.")
153
  return None
154
 
155
- # Step 1: Generate embeddings
156
- embeddings = generate_embeddings(df, content_column)
157
 
158
- # Step 2: Determine the optimum number of clusters
159
- num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)
160
-
161
- # Step 3: Perform clustering
162
- cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
163
- df['cluster_label'] = cluster_labels
164
 
165
- # Step 4: Extract TF-IDF matrix
166
- print("🔠 Extracting TF-IDF matrix for clusters...")
167
- vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
168
- tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
169
- feature_names = vectorizer.get_feature_names_out()
170
 
171
- # Step 5: Process each cluster
172
- print("🔍 Processing clusters for TF-IDF and topic modeling...")
173
- grouped = defaultdict(list)
174
- for idx, label in enumerate(cluster_labels):
175
- grouped[label].append(idx)
176
-
177
- refined_labels = [""] * num_clusters # Initialize refined_labels with empty strings
178
- detected_topics = {}
179
- for cluster_id, indices in grouped.items():
180
- cluster_texts = tfidf_matrix[indices]
181
-
182
- # Extract TF-IDF keywords
183
- avg_tfidf = cluster_texts.mean(axis=0).A1
184
- top_indices = np.argsort(avg_tfidf)[::-1][:3]
185
- tfidf_keywords = [feature_names[i] for i in top_indices]
186
-
187
- # Generate a cluster label using the top TF-IDF keywords
188
- cluster_label_tfidf = ", ".join(tfidf_keywords)
189
-
190
- # Apply topic modeling
191
- lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
192
- lda.fit(cluster_texts)
193
- topics = []
194
- topic_weights = []
195
- for topic_idx, topic in enumerate(lda.components_):
196
- top_topic_indices = topic.argsort()[:-4:-1]
197
- topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
198
- topic_weights.append(topic.sum()) # Sum of weights for ranking
199
-
200
- # Rank topics by importance
201
- ranked_topics = [x for _, x in sorted(zip(topic_weights, topics), reverse=True)]
202
-
203
- # Generate Primary focus and Related topics
204
- primary_focus = ranked_topics[0] if ranked_topics else "N/A"
205
- related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []
206
-
207
- # Store detected topics for user display
208
- detected_topics[cluster_label_tfidf] = {
209
- "primary_focus": primary_focus,
210
- "related_topics": related_topics,
211
  }
 
 
212
 
213
- # Assign the TF-IDF keywords as the cluster label
214
- refined_labels[cluster_id] = cluster_label_tfidf
215
-
216
- # Assign refined labels to clusters
217
- df['cluster_label'] = [refined_labels[label] for label in cluster_labels]
218
-
219
- print("✅ Clustering and labeling complete!")
220
  return {
221
  "dataframe": df,
222
  "detected_topics": detected_topics,
223
- "number_of_clusters": num_clusters,
 
 
224
  }
 
1
+ # cluster_news.py
2
+ # Clusters news articles using HDBSCAN, labels clusters with TF-IDF n-grams and LDA topics,
3
+ # and falls back to a representative summary if the label is too vague.
4
+
5
  import numpy as np
6
  import pandas as pd
 
 
 
 
7
  from collections import defaultdict
8
  from sentence_transformers import SentenceTransformer
9
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
10
+ from sklearn.metrics.pairwise import cosine_distances
11
+ from sklearn.decomposition import LatentDirichletAllocation
12
+ import hdbscan
13
+ import umap
14
 
15
  def generate_embeddings(df, content_column):
 
 
 
 
16
  model = SentenceTransformer('all-MiniLM-L6-v2')
17
  embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
18
+ return np.array(embeddings)
19
+
20
+ def reduce_dimensions(embeddings, n_neighbors=10, min_dist=0.0, n_components=5, random_state=42):
21
+ n_samples = embeddings.shape[0]
22
+ if n_samples < 3:
23
+ return embeddings
24
+ n_components = min(max(2, n_components), n_samples - 2)
25
+ n_neighbors = min(max(2, n_neighbors), n_samples - 1)
26
+ reducer = umap.UMAP(
27
+ n_neighbors=n_neighbors,
28
+ min_dist=min_dist,
29
+ n_components=n_components,
30
+ random_state=random_state,
31
+ metric='cosine'
32
+ )
33
+ reduced = reducer.fit_transform(embeddings)
34
+ return reduced
35
+
36
+ def cluster_with_hdbscan(embeddings, min_cluster_size=2, min_samples=1):
37
+ clusterer = hdbscan.HDBSCAN(
38
+ min_cluster_size=min_cluster_size,
39
+ min_samples=min_samples,
40
+ metric='euclidean'
41
+ )
42
+ labels = clusterer.fit_predict(embeddings)
43
+ return labels, clusterer
44
+
45
+ def extract_tfidf_labels(df, content_column, cluster_labels, top_n=6):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  grouped = defaultdict(list)
47
  for idx, label in enumerate(cluster_labels):
48
+ if label == -1: continue
49
  grouped[label].append(df.iloc[idx][content_column])
 
50
  tfidf_labels = {}
51
  for cluster_id, texts in grouped.items():
52
  vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
53
  tfidf_matrix = vectorizer.fit_transform(texts)
54
  avg_tfidf = tfidf_matrix.mean(axis=0).A1
55
+ if len(avg_tfidf) == 0:
56
+ tfidf_labels[cluster_id] = []
57
+ continue
58
+ top_indices = np.argsort(avg_tfidf)[::-1][:top_n]
59
  top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
60
+ tfidf_labels[cluster_id] = top_terms
 
61
  return tfidf_labels
62
 
63
+ def lda_topic_modeling(texts, n_topics=1, n_words=6):
64
+ vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
65
+ X = vectorizer.fit_transform(texts)
66
+ if X.shape[0] < n_topics:
67
+ n_topics = max(1, X.shape[0])
68
+ lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
69
+ lda.fit(X)
70
+ topic_words = []
71
+ for topic_idx, topic in enumerate(lda.components_):
72
+ top_indices = topic.argsort()[:-n_words - 1:-1]
73
+ words = [vectorizer.get_feature_names_out()[i] for i in top_indices]
74
+ topic_words.extend(words)
75
+ return topic_words
76
+
77
+ def get_representative_summary(df, cluster_indices, embeddings, centroid):
78
+ cluster_embs = embeddings[cluster_indices]
79
+ dists = cosine_distances(cluster_embs, centroid.reshape(1, -1)).flatten()
80
+ min_idx = np.argmin(dists)
81
+ return df.iloc[cluster_indices[min_idx]]["summary"]
82
+
83
+ def label_clusters_hybrid(df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=15):
84
+ cluster_label_map = {}
85
+ cluster_primary_topics = {}
86
+ cluster_related_topics = {}
87
+ for cluster_id in set(cluster_labels):
88
+ if cluster_id == -1:
 
 
 
89
  continue
90
+ topics = lda_labels.get(cluster_id, []) or tfidf_labels.get(cluster_id, [])
91
+ topics = [t for t in topics if t]
92
+ primary_topics = topics[:3]
93
+ related_topics = topics[3:]
94
+ label = ", ".join(primary_topics) if primary_topics else ""
95
+ if not label or len(label) < vague_threshold:
96
+ cluster_indices = np.where(cluster_labels == cluster_id)[0]
97
+ centroid = embeddings[cluster_indices].mean(axis=0)
98
+ rep_summary = get_representative_summary(df, cluster_indices, embeddings, centroid)
99
+ label = rep_summary[:80] + "..." if len(rep_summary) > 80 else rep_summary
100
+ cluster_label_map[cluster_id] = label
101
+ cluster_primary_topics[cluster_id] = primary_topics
102
+ cluster_related_topics[cluster_id] = related_topics
103
+ return cluster_label_map, cluster_primary_topics, cluster_related_topics
104
+
105
+ def cluster_and_label_articles(
106
+ df,
107
+ content_column="content",
108
+ summary_column="summary",
109
+ min_cluster_size=2,
110
+ min_samples=1,
111
+ n_neighbors=10,
112
+ min_dist=0.0,
113
+ n_components=5,
114
+ top_n=6,
115
+ lda_n_topics=1,
116
+ lda_n_words=6,
117
+ vague_threshold=15
118
+ ):
119
  if df.empty:
 
120
  return None
121
 
122
+ min_cluster_size = max(2, min(min_cluster_size, len(df) // 2)) if len(df) < 20 else min_cluster_size
 
123
 
124
+ embeddings = generate_embeddings(df, content_column)
125
+ reduced_embeddings = reduce_dimensions(embeddings, n_neighbors, min_dist, n_components)
126
+ cluster_labels, clusterer = cluster_with_hdbscan(reduced_embeddings, min_cluster_size, min_samples)
127
+ df['cluster_id'] = cluster_labels
 
 
128
 
129
+ tfidf_labels = extract_tfidf_labels(df, content_column, cluster_labels, top_n=top_n)
 
 
 
 
130
 
131
+ lda_labels = {}
132
+ for cluster_id in set(cluster_labels):
133
+ if cluster_id == -1:
134
+ continue
135
+ cluster_texts = df[cluster_labels == cluster_id][content_column].tolist()
136
+ if cluster_texts:
137
+ topics = lda_topic_modeling(
138
+ cluster_texts, n_topics=lda_n_topics, n_words=lda_n_words
139
+ )
140
+ lda_labels[cluster_id] = topics
141
+ else:
142
+ lda_labels[cluster_id] = []
143
+
144
+ cluster_label_map, cluster_primary_topics, cluster_related_topics = label_clusters_hybrid(
145
+ df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=vague_threshold
146
+ )
147
+
148
+ df['cluster_label'] = [
149
+ cluster_label_map.get(cid, "Noise/Other") if cid != -1 else "Noise/Other"
150
+ for cid in cluster_labels
151
+ ]
152
+ df['lda_topics'] = [
153
+ ", ".join(lda_labels.get(cid, [])) if cid != -1 else "" for cid in cluster_labels
154
+ ]
155
+
156
+ detected_topics = {
157
+ label: {
158
+ "size": int((df['cluster_label'] == label).sum())
 
 
 
 
 
 
 
 
 
 
 
 
159
  }
160
+ for label in set(df['cluster_label']) if label != "Noise/Other"
161
+ }
162
 
 
 
 
 
 
 
 
163
  return {
164
  "dataframe": df,
165
  "detected_topics": detected_topics,
166
+ "number_of_clusters": len(detected_topics),
167
+ "cluster_primary_topics": cluster_primary_topics,
168
+ "cluster_related_topics": cluster_related_topics
169
  }
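The clustering path is now embed, reduce with UMAP, then cluster with HDBSCAN. Below is a minimal sketch of that flow using random vectors in place of the SentenceTransformer embeddings so it runs without model downloads; the synthetic blobs and parameter values are illustrative.

# Sketch of the reduce-then-cluster flow from the new cluster_news.py (synthetic data).
import numpy as np
import umap      # provided by the umap-learn distribution
import hdbscan

rng = np.random.default_rng(42)
# Two loose blobs of 384-dim vectors, the output size of all-MiniLM-L6-v2.
embeddings = np.vstack([rng.normal(0.0, 0.05, (8, 384)), rng.normal(1.0, 0.05, (8, 384))])

reduced = umap.UMAP(n_neighbors=5, min_dist=0.0, n_components=5,
                    metric="cosine", random_state=42).fit_transform(embeddings)
labels = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1,
                         metric="euclidean").fit_predict(reduced)
print(labels)  # -1 marks noise; other integers are cluster ids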
extract_news.py CHANGED
@@ -1,244 +1,36 @@
1
  # extract_news.py
 
2
 
3
- # This script is designed to extract news articles from various sources, including NewsAPI and Google News RSS using the URLs saved from the gather_news.py file.
4
- # It includes functions for extracting clean,full-text content from the articles, and storing the metadata into a file.
5
-
6
-
7
- # Article Scraping & Text Extraction
8
-
9
- from newspaper import Article
10
- import pandas as pd
11
  import logging
12
- import requests
13
- from bs4 import BeautifulSoup
14
-
15
-
16
- # * For each URL from NewsAPI or RSS, * Create Article(url)* Call .download(), .parse(), .text and * Optionally use .nlp() to get summary and keywords
17
 
18
- def extract_full_content(url, min_length=300):
19
- """
20
- Extract full content and title from the given URL using newspaper3k.
21
- Always returns a tuple (content, title) or (None, None).
22
- """
23
  try:
24
  article = Article(url)
25
  article.download()
26
  article.parse()
27
-
28
  text = article.text.strip()
29
  title = article.title.strip() if article.title else "Untitled"
30
-
31
- # Filter out short content
32
  if len(text) < min_length:
33
  logging.warning(f"Extracted content is too short from {url}.")
34
- return None, None
35
-
36
- return text, title
37
-
38
  except Exception as e:
39
  logging.error(f"Failed to extract content from {url}: {str(e)}")
40
- return None, None
41
-
42
-
43
- def extract_full_content_rss(url, min_length=300):
44
- """
45
- Extract full content and title from an RSS article using BeautifulSoup.
46
- Always returns a tuple: (text, title) or (None, None).
47
- """
48
- try:
49
- response = requests.get(url, timeout=10)
50
- if response.status_code != 200:
51
- logging.error(f"Error fetching URL {url}: {response.status_code}")
52
- return None, None
53
-
54
- soup = BeautifulSoup(response.content, 'html.parser')
55
- title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
56
- paragraphs = soup.find_all('p')
57
- text = ' '.join([para.get_text() for para in paragraphs]).strip()
58
-
59
- if len(text) < min_length:
60
- logging.warning(f"Extracted content is too short from {url}.")
61
- return None, None
62
-
63
- return text, title
64
-
65
- except Exception as e:
66
- logging.error(f"Error extracting content from {url}: {str(e)}")
67
- return None, None
68
-
69
-
70
- # * Handle common edge cases such as * Paywalled content (skip or tag) and * Duplicate links or broken URLs
71
- def is_paywalled(url):
72
- """
73
- * Check if the URL is paywalled
74
- """
75
- paywall_indicators = ['paywall', 'subscription', 'premium']
76
- return any(indicator in url for indicator in paywall_indicators)
77
-
78
- def is_paywalled_content(article):
79
- """
80
- * Check if the article is paywalled
81
- """
82
- if not article:
83
- return False
84
- if not article.get("text"):
85
- return False
86
- if is_paywalled(article.get("url", "")):
87
- return True
88
- return False
89
 
90
- def is_duplicate(url, existing_urls):
91
- """
92
- * Check if the URL is a duplicate
93
- """
94
- return url in existing_urls
95
-
96
- def is_broken(url):
97
- """
98
- * Check if the URL is broken
99
- """
100
- try:
101
- response = requests.head(url, allow_redirects=True)
102
- return response.status_code != 200
103
- except requests.RequestException:
104
- return True
105
-
106
- def is_valid_url(url):
107
- """
108
- * Check if the URL is valid
109
- """
110
- regex = re.compile(
111
- r'^(?:http|ftp)s?://' # http:// or https://
112
- r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
113
- r'localhost|' # localhost...
114
- r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4
115
- r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
116
- r'(?::\d+)?' # optional port
117
- r'(?:/?|[/?]\S+)$', re.IGNORECASE)
118
- return re.match(regex, url) is not None
119
-
120
- def is_valid_url_content(url):
121
- """
122
- * Check if the URL is valid
123
- """
124
- if not url:
125
- return False
126
- if not is_valid_url(url):
127
- return False
128
- if is_paywalled(url):
129
- return False
130
- if is_broken(url):
131
- return False
132
- return True
133
-
134
- # Additional functions to check if the article have empty content or blocked sites
135
-
136
- def is_empty_content(article):
137
- """
138
- * Check if the article content is empty
139
- """
140
- if not article:
141
- return True
142
- if not article.get("text"):
143
- return True
144
- return False
145
-
146
- def is_blocked_site(url):
147
- """
148
- * Check if the URL is from a blocked site
149
- """
150
- blocked_sites = ['example.com', 'blockedsite.com'] # Add your blocked sites here
151
- return any(blocked_site in url for blocked_site in blocked_sites)
152
-
153
- def is_blocked_content(article):
154
- """
155
- * Check if the article is from a blocked site
156
- """
157
- if not article:
158
- return False
159
- if not article.get("text"):
160
- return False
161
- if is_blocked_site(article.get("url", "")):
162
- return True
163
- return False
164
-
165
- # Extract news articles from the given URLs
166
-
167
- def extract_news_articles(urls):
168
- """
169
- * Extract news articles from the given URLs
170
- """
171
  extracted_articles = []
172
- existing_urls = set()
173
-
174
  for url in urls:
175
- if not is_valid_url_content(url):
176
- logging.warning(f"Skipping invalid or paywalled URL: {url}")
177
- continue
178
- if is_duplicate(url, existing_urls):
179
- logging.warning(f"Skipping duplicate URL: {url}")
180
- continue
181
- existing_urls.add(url)
182
-
183
- article = extract_full_content(url)
184
- if not article:
185
- logging.warning(f"Failed to extract content from {url}")
186
- continue
187
-
188
- if is_paywalled_content(article):
189
- logging.warning(f"Skipping paywalled content from URL: {url}")
190
- continue
191
-
192
- extracted_articles.append(article)
193
-
194
- return extracted_articles
195
-
196
- def extract_news_articles_rss(urls):
197
- """
198
- * Extract news articles from the given RSS URLs
199
- """
200
- extracted_articles = []
201
- existing_urls = set()
202
-
203
- for url in urls:
204
- if not is_valid_url_content(url):
205
- logging.warning(f"Skipping invalid or paywalled URL: {url}")
206
- continue
207
- if is_duplicate(url, existing_urls):
208
- logging.warning(f"Skipping duplicate URL: {url}")
209
- continue
210
- existing_urls.add(url)
211
-
212
- article = extract_full_content_rss(url)
213
- if not article:
214
- logging.warning(f"Failed to extract content from {url}")
215
- continue
216
-
217
- if is_paywalled_content(article):
218
- logging.warning(f"Skipping paywalled content from URL: {url}")
219
- continue
220
-
221
- extracted_articles.append(article)
222
-
223
  return extracted_articles
224
 
225
- # Metadata Structuring and Storage
226
- # Functions to create a dataframe with all the metadata for extracted fields title,url,source,author, published_at and full_text for each extracted article and save it to a csv file
227
-
228
  def create_dataframe(articles):
229
- """
230
- Create a pandas DataFrame from the list of articles.
231
- """
232
  return pd.DataFrame(articles)
233
 
234
  def save_to_csv(df, filename):
235
- """
236
- Save the DataFrame to a CSV file.
237
- """
238
- df.to_csv(filename, index=False)
239
-
240
- def save_to_json(df, filename):
241
- """
242
- Save the DataFrame to a JSON file.
243
- """
244
- df.to_json(filename, orient="records", lines=True)
 
  # extract_news.py
+ # This script extracts full content from news articles using the newspaper3k library.

  import logging
+ import pandas as pd
+ from newspaper import Article

+ def extract_full_content(url, min_length=100):
      try:
          article = Article(url)
          article.download()
          article.parse()
          text = article.text.strip()
          title = article.title.strip() if article.title else "Untitled"
          if len(text) < min_length:
              logging.warning(f"Extracted content is too short from {url}.")
+             return None
+         return {"url": url, "text": text, "title": title}
      except Exception as e:
          logging.error(f"Failed to extract content from {url}: {str(e)}")
+         return None

+ def extract_news_articles(urls, min_length=100):
      extracted_articles = []
      for url in urls:
+         article = extract_full_content(url, min_length=min_length)
+         if article and article.get("text"):
+             article["original_url"] = url
+             extracted_articles.append(article)
      return extracted_articles

  def create_dataframe(articles):
      return pd.DataFrame(articles)

  def save_to_csv(df, filename):
+     df.to_csv(filename, index=False)
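The slimmed-down extractor returns one dict per usable URL (url, text, title, plus the original_url added by extract_news_articles). A minimal usage sketch; the URL is a placeholder, and network access plus the newspaper3k dependency are assumed.

# Sketch: calling the new extract_news helpers on a placeholder URL (requires network access).
from extract_news import extract_news_articles, create_dataframe, save_to_csv

urls = ["https://example.com/some-article"]  # placeholder
articles = extract_news_articles(urls, min_length=100)
if articles:
    df = create_dataframe(articles)           # columns: url, text, title, original_url
    save_to_csv(df, "extracted_articles.csv")
    print(df[["title", "url"]])
else:
    print("Nothing extracted (short content or a download failure).")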
gather_news.py CHANGED
@@ -1,73 +1,121 @@
1
  # gather_news.py
2
-
3
-
4
  # News Source Integration
5
- # This script integrates with various news sources to fetch the latest articles from the specified news sources, extracts relevant information such as title, URL,Source,Author and Publish date.
 
6
 
7
  import requests
8
- import feedparser
9
- import os
 
 
 
 
 
 
10
 
11
- def fetch_articles_newsapi(topic):
12
- """
13
- Fetch articles from NewsAPI based on the provided topic.
14
- """
15
  url = 'https://newsapi.org/v2/everything'
16
- api_key = os.environ.get("api_key") # Make sure the key name matches what's in HF settings
17
- if not api_key:
18
- raise ValueError("API_KEY is not set in environment variables.")
19
  params = {
20
- 'apiKey': api_key,
21
  'language': 'en',
22
  'q': topic,
23
- 'pageSize': 20
 
24
  }
25
- try:
26
- response = requests.get(url, params=params)
27
- if response.status_code != 200:
28
- return f"Error: Failed to fetch news. Status code: {response.status_code}"
29
-
30
- articles = response.json().get("articles", [])
31
- if not articles:
32
- return "No articles found."
33
-
34
- # Extract relevant information from each article
35
- extracted_articles = []
36
- for article in articles:
37
- extracted_articles.append({
38
- "title": article.get("title", "No title"),
39
- "url": article.get("url", "#"),
40
- "source": article.get("source", {}).get("name", "Unknown"),
41
- "author": article.get("author", "Unknown"),
42
- "publishedAt": article.get("publishedAt", "Unknown")
43
- })
44
-
45
- return extracted_articles
46
- except Exception as e:
47
- return f"Error fetching news: {str(e)}"
48
-
49
- def fetch_articles_google(topic):
50
- """
51
- Fetch articles from Google News RSS feed based on the provided topic.
52
- """
53
- rss_url = f'https://news.google.com/rss/search?q={topic}&hl=en-US&gl=US&ceid=US:en'
54
- try:
55
- feed = feedparser.parse(rss_url)
56
- if not feed.entries:
57
- return "No articles found."
58
-
59
- # Extract relevant information from each article
60
- extracted_articles = []
61
- for entry in feed.entries[:20]: # Limit to top 20 articles
62
- extracted_articles.append({
63
- "title": entry.title,
64
- "url": entry.link,
65
- "source": entry.source.title if hasattr(entry, 'source') else "Unknown",
66
- "author": entry.author if hasattr(entry, 'author') else "Unknown",
67
- "publishedAt": entry.published if hasattr(entry, 'published') else "Unknown"
68
- })
69
 
70
- return extracted_articles
71
- except Exception as e:
72
- return f"Error fetching news: {str(e)}"
73
-
 
 
1
  # gather_news.py
 
 
2
  # News Source Integration
3
+ # This script integrates with various news sources to fetch the latest articles from the specified news sources,
4
+ # extracts relevant information such as title, URL, Source, Author and Publish date, and extracts full content.
5
 
6
  import requests
7
+ from extract_news import extract_news_articles, create_dataframe, save_to_csv
8
+
9
+ def fetch_newsapi_top_headlines(min_length=100, max_articles=30):
10
+ import config
11
+ url = 'https://newsapi.org/v2/top-headlines'
12
+ params = {
13
+ 'apiKey': config.api_key,
14
+ 'language': 'en',
15
+ 'pageSize': max_articles
16
+ }
17
+ response = requests.get(url, params=params)
18
+ if response.status_code != 200:
19
+ print(f"Error: Failed to fetch news from NewsAPI Top Headlines. Status code: {response.status_code}")
20
+ return []
21
+ articles = response.json().get("articles", [])
22
+ if not articles:
23
+ print("No articles found in NewsAPI Top Headlines.")
24
+ return []
25
+ meta_by_url = {}
26
+ urls = []
27
+ for article in articles:
28
+ url = article.get("url", "#")
29
+ meta = {
30
+ "url": url,
31
+ "title": article.get("title", ""),
32
+ "source": article.get("source", {}).get("name", ""),
33
+ "author": article.get("author", "Unknown"),
34
+ "publishedAt": article.get("publishedAt", "Unknown"),
35
+ }
36
+ meta_by_url[url] = meta
37
+ urls.append(url)
38
+ print(f"Fetched {len(urls)} article URLs from NewsAPI Top Headlines.")
39
+ extracted_articles = extract_news_articles(urls, min_length=min_length)
40
+ merged_articles = []
41
+ for art in extracted_articles:
42
+ meta = meta_by_url.get(art.get("original_url"))
43
+ if not meta:
44
+ meta = {
45
+ "title": art.get("title", "Untitled"),
46
+ "source": "",
47
+ "author": "Unknown",
48
+ "publishedAt": "Unknown"
49
+ }
50
+ merged = {
51
+ "url": art.get("url"),
52
+ "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
53
+ "source": meta["source"],
54
+ "author": meta["author"],
55
+ "publishedAt": meta["publishedAt"],
56
+ "text": art.get("text", ""),
57
+ }
58
+ merged_articles.append(merged)
59
+ print(f"Usable articles after extraction (NewsAPI Top Headlines): {len(merged_articles)}")
60
+ return merged_articles
61
 
62
+ def fetch_newsapi_everything(topic, min_length=100, max_articles=50):
63
+ import config
 
 
64
  url = 'https://newsapi.org/v2/everything'
 
 
 
65
  params = {
66
+ 'apiKey': config.api_key,
67
  'language': 'en',
68
  'q': topic,
69
+ 'pageSize': max_articles,
70
+ 'sortBy': 'publishedAt'
71
  }
72
+ response = requests.get(url, params=params)
73
+ if response.status_code != 200:
74
+ print(f"Error: Failed to fetch news from NewsAPI Everything. Status code: {response.status_code}")
75
+ return []
76
+ articles = response.json().get("articles", [])
77
+ if not articles:
78
+ print("No articles found in NewsAPI Everything.")
79
+ return []
80
+ meta_by_url = {}
81
+ urls = []
82
+ for article in articles:
83
+ url = article.get("url", "#")
84
+ meta = {
85
+ "url": url,
86
+ "title": article.get("title", ""),
87
+ "source": article.get("source", {}).get("name", ""),
88
+ "author": article.get("author", "Unknown"),
89
+ "publishedAt": article.get("publishedAt", "Unknown"),
90
+ }
91
+ meta_by_url[url] = meta
92
+ urls.append(url)
93
+ print(f"Fetched {len(urls)} article URLs from NewsAPI Everything.")
94
+ extracted_articles = extract_news_articles(urls, min_length=min_length)
95
+ merged_articles = []
96
+ for art in extracted_articles:
97
+ meta = meta_by_url.get(art.get("original_url"))
98
+ if not meta:
99
+ meta = {
100
+ "title": art.get("title", "Untitled"),
101
+ "source": "",
102
+ "author": "Unknown",
103
+ "publishedAt": "Unknown"
104
+ }
105
+ merged = {
106
+ "url": art.get("url"),
107
+ "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
108
+ "source": meta["source"],
109
+ "author": meta["author"],
110
+ "publishedAt": meta["publishedAt"],
111
+ "text": art.get("text", ""),
112
+ }
113
+ merged_articles.append(merged)
114
+ print(f"Usable articles after extraction (NewsAPI Everything): {len(merged_articles)}")
115
+ return merged_articles
116
 
117
+ def fetch_articles(topic=None, min_length=100, max_articles=30):
118
+ if topic and topic.strip():
119
+ return fetch_newsapi_everything(topic, min_length=min_length, max_articles=max_articles)
120
+ else:
121
+ return fetch_newsapi_top_headlines(min_length=min_length, max_articles=max_articles)
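Both fetchers wrap NewsAPI endpoints and read the key from a local config module (config.api_key). Below is a sketch of the equivalent raw request; taking the key from an environment variable is an assumption made only for this example.

# Sketch of the underlying top-headlines request made by fetch_newsapi_top_headlines.
# Key handling here (environment variable) is an assumption; the committed code uses config.api_key.
import os
import requests

api_key = os.environ.get("NEWSAPI_KEY")
resp = requests.get(
    "https://newsapi.org/v2/top-headlines",
    params={"apiKey": api_key, "language": "en", "pageSize": 5},
    timeout=10,
)
if resp.status_code == 200:
    for art in resp.json().get("articles", []):
        print(art.get("publishedAt"), "-", art.get("title"))
else:
    print("NewsAPI request failed:", resp.status_code)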
input_topic.py CHANGED
@@ -1,7 +1,4 @@

  # input_topic.py
-
-
- # Input Design
  # This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.

  def get_topic():

  # input_topic.py
  # This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.

  def get_topic():
requirements.txt CHANGED
@@ -9,4 +9,7 @@ numpy

  requests
  gradio
  lxml_html_clean
  sentence_transformers

  requests
  gradio
  lxml_html_clean
+ plotly
+ hdbscan
+ umap-learn
  sentence_transformers
summarizer.py CHANGED
@@ -6,14 +6,6 @@ from transformers import pipeline

  # Load summarization pipeline
  summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")

- # Load once globally
-
- #from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
- #tokenizer = AutoTokenizer.from_pretrained("flant5-base")
- #model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
- #summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
-
  # Function to split text into smaller chunks
  def split_text(text, max_tokens=512):
      words = text.split()

  # Load summarization pipeline
  summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")

  # Function to split text into smaller chunks
  def split_text(text, max_tokens=512):
      words = text.split()
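summarizer.py keeps the word-based chunking helper around the fine-tuned FLAN-T5 pipeline. A sketch of chunked summarization in that style is shown below; the chunk size, generation limits, and helper names are illustrative, and the model is downloaded on first use.

# Illustrative sketch of chunked summarization in the style of summarizer.py.
from transformers import pipeline

summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")

def split_words(text, max_tokens=512):
    # Word-based chunking, mirroring split_text's approach.
    words = text.split()
    return [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_long(text):
    chunks = split_words(text)
    parts = [summarizer(c, max_length=128, min_length=20, do_sample=False)[0]["summary_text"]
             for c in chunks]
    return " ".join(parts)

print(summarize_long("Paste or load a long article here ..."))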