Upload 8 files
- analyze_sentiment.py +9 -13
- app.py +208 -174
- cluster_news.py +137 -192
- extract_news.py +13 -221
- gather_news.py +110 -62
- input_topic.py +0 -3
- requirements.txt +3 -0
- summarizer.py +0 -8
analyze_sentiment.py
CHANGED
@@ -1,28 +1,24 @@

The DistilBERT sentiment pipeline (sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")) and a trailing "# Example usage" comment are removed; sentiment is now scored with zero-shot classification. New version:

# analyze_sentiment.py
# This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.

from transformers import pipeline

# Load zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def analyze_summary(summary):
    """
    Analyze the sentiment of the given summary using zero-shot classification.
    Returns a tuple of (sentiment, score).
    """
    try:
        if not summary.strip():
            return "No input provided.", 0.0

        candidate_labels = ["positive", "neutral", "negative"]
        result = classifier(summary, candidate_labels)
        sentiment = result['labels'][0].capitalize()
        score = float(result['scores'][0])
        return sentiment, score
    except Exception as e:
        return f"Error analyzing sentiment: {str(e)}", 0.0
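A minimal usage sketch (not part of the commit) of the updated analyze_summary; the example text is invented and the printed values are illustrative, since the actual score depends on the bart-large-mnli model:

# illustrative usage of analyze_sentiment.analyze_summary
from analyze_sentiment import analyze_summary

sentiment, score = analyze_summary(
    "The city opened a new park and residents welcomed the extra green space."
)
print(sentiment, score)  # e.g. "Positive" 0.9 — exact value depends on the model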
app.py
CHANGED

The .txt file-upload path (the uploaded_files accordion and extract_summarize_and_analyze_content_from_file) and the old display_clusters_as_columns rendering are removed; the new version adds article deduplication, Plotly analytics, and a collapsible Clustered News Digest section. Added and context lines, by hunk (unchanged lines between hunks are not shown in the diff):

@@ -1,110 +1,123 @@

## This script provides a Gradio interface for gathering, clustering, summarizing, and analyzing news articles with sentiment analysis and topic modeling.

import gather_news
import pandas as pd
import cluster_news
import summarizer
import analyze_sentiment
import extract_news
import gradio as gr
import plotly.express as px

def plot_topic_frequency(result):
    df = result["dataframe"]
    topic_counts = df["cluster_label"].value_counts().reset_index()
    topic_counts.columns = ["Topic", "Count"]
    fig = px.bar(topic_counts, x="Topic", y="Count", title="Topic Frequency", color="Topic")
    fig.update_layout(showlegend=False, height=350)
    return fig

def plot_sentiment_trends(result):
    df = result["dataframe"]
    sentiment_counts = df["sentiment"].value_counts().reset_index()
    sentiment_counts.columns = ["Sentiment", "Count"]
    fig = px.pie(sentiment_counts, names="Sentiment", values="Count", title="Sentiment Distribution")
    fig.update_traces(textinfo='label+percent')
    fig.update_layout(height=350)
    return fig

def render_top_clusters_table(result, top_n=5):
    df = result["dataframe"]
    cluster_counts = df["cluster_label"].value_counts().reset_index()
    cluster_counts.columns = ["Cluster", "Articles"]
    top_clusters = cluster_counts.head(top_n)
    return top_clusters

def fetch_and_process_latest_news(sentiment_filters):
    articles = gather_news.fetch_newsapi_top_headlines()
    return process_and_display_articles(articles, sentiment_filters, "Top Headlines")

def fetch_and_process_topic_news(topic, sentiment_filters):
    articles = gather_news.fetch_newsapi_everything(topic)
    return process_and_display_articles(articles, sentiment_filters, topic or "Topic")

def process_and_display_articles(articles, sentiment_filters, topic_label):
    if not articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    articles = sorted(articles, key=lambda x: x.get("publishedAt", ""), reverse=True)
    extracted_articles = extract_summarize_and_analyze_articles(articles)
    deduped_articles = deduplicate_articles(extracted_articles)
    if not deduped_articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    df = pd.DataFrame(deduped_articles)
    result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
    cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
    csv_file, _ = save_clustered_articles(result["dataframe"], topic_label)

    # Analytics
    topic_fig = plot_topic_frequency(result)
    sentiment_fig = plot_sentiment_trends(result)
    top_clusters_table = render_top_clusters_table(result)

    return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)

def extract_summarize_and_analyze_articles(articles):
    extracted_articles = []
    for article in articles:
        content = article.get("text") or article.get("content")
        if not content:
            continue
        title = article.get("title", "No title")
        summary = summarizer.generate_summary(content)
        sentiment, score = analyze_sentiment.analyze_summary(summary)
        extracted_articles.append({
            "title": title,
            "url": article.get("url"),
            "source": article.get("source", "Unknown"),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
            "content": content,
            "summary": summary,
            "sentiment": sentiment,
            "score": score
        })
    return extracted_articles

def deduplicate_articles(articles):
    seen_urls = set()
    seen_title_source = set()
    seen_title_summary = set()
    deduped = []
    for art in articles:
        url = art.get("url")
        title = art.get("title", "").strip().lower()
        source = art.get("source", "").strip().lower()
        summary = art.get("summary", "").strip().lower()
        key_title_source = (title, source)
        key_title_summary = (title, summary)
        if url and url in seen_urls:
            continue
        if key_title_source in seen_title_source:
            continue
        if key_title_summary in seen_title_summary:
            continue
        deduped.append(art)
        if url:
            seen_urls.add(url)
        seen_title_source.add(key_title_source)
        seen_title_summary.add(key_title_summary)
    return deduped

def extract_summarize_and_analyze_content_from_urls(urls):
    articles = extract_news.extract_news_articles(urls)
    return extract_summarize_and_analyze_articles(articles)

def display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters=None):
    df = result["dataframe"]
    cluster_primary_topics = result.get("cluster_primary_topics", {})
    cluster_related_topics = result.get("cluster_related_topics", {})
    df["sentiment"] = df["sentiment"].str.capitalize()

    if sentiment_filters:

@@ -117,24 +130,50 @@ def display_clusters_as_columns(result, sentiment_filters=None):

    markdown_blocks = []

    for cluster_label, articles in clusters:
        cluster_md = f"<div style='border:2px solid #e0e0e0; border-radius:10px; margin-bottom:18px; padding:18px; background: #f9f9fa;'>"
        cluster_md += f"<h3 style='color:#2d6cdf;'>🧩 Cluster: {cluster_label}</h3>"

        lda_topics = articles["lda_topics"].iloc[0] if "lda_topics" in articles else ""
        if lda_topics:
            cluster_md += f"<b style='color:#0d47a1;'>Main Themes:</b> <span style='color:#1976d2'>{lda_topics}</span><br>"

        primary = cluster_primary_topics.get(cluster_label, [])
        if primary:
            cluster_md += f"<b style='color:#1b5e20;'>Primary Topics:</b> <span style='color:#388e3c'>{', '.join(primary)}</span><br>"

        related = cluster_related_topics.get(cluster_label, [])
        if related:
            cluster_md += f"<b style='color:#616161;'>Related Topics:</b> <span style='color:#757575'>{', '.join(related)}</span><br>"

        cluster_md += f"<b>Articles:</b> {len(articles)}<br><br>"

        for sentiment in ["Positive", "Neutral", "Negative"]:
            sentiment_articles = articles[articles["sentiment"] == sentiment]
            if not sentiment_articles.empty:
                color = {"Positive": "#e8f5e9", "Neutral": "#e3f2fd", "Negative": "#ffebee"}[sentiment]
                border = {"Positive": "#43a047", "Neutral": "#1976d2", "Negative": "#c62828"}[sentiment]
                sentiment_label = {
                    "Positive": "Positive News",
                    "Neutral": "Neutral News",
                    "Negative": "Negative News"
                }[sentiment]
                cluster_md += (
                    f"<div style='background:{color}; border-left:6px solid {border}; border-radius:6px; margin-bottom:10px; padding:10px;'>"
                    f"<span style='font-size:1.2em;'><b>{sentiment_label} ({len(sentiment_articles)})</b></span><br>"
                )
                for _, article in sentiment_articles.iterrows():
                    cluster_md += (
                        f"<div style='margin:10px 0 10px 0; padding:10px; border-bottom:1px solid #e0e0e0;'>"
                        f"<span style='font-weight:bold; color:#37474f;'>📰 {article['title']}</span><br>"
                        f"<span style='font-size:0.95em;'>"
                        f"<b>Source:</b> {article['source']}<br>"
                        f"<details><summary style='cursor:pointer; color:#1976d2;'><strong>Summary</strong></summary>"
                        f"<div style='margin-left:10px; color:#424242;'>{article['summary']}</div></details>"
                        f"<a href='{article['url']}' target='_blank' style='color:#1976d2;'>Read Full Article</a>"
                        f"</span></div>"
                    )
                cluster_md += "</div>"
        cluster_md += "</div>"
        markdown_blocks.append(cluster_md)

    while len(markdown_blocks) < 5:

@@ -149,88 +188,88 @@ def save_clustered_articles(df, topic):

    df.to_csv(csv_file, index=False)
    return csv_file, None

def update_ui_with_columns(topic, urls, sentiment_filters):
    extracted_articles = []

    if topic and topic.strip():
        return fetch_and_process_topic_news(topic, sentiment_filters)

    if urls:
        url_list = [url.strip() for url in urls.split("\n") if url.strip()]
        extracted_articles.extend(extract_summarize_and_analyze_content_from_urls(url_list))

    if not extracted_articles:
        return sentiment_filters, "", "", "", "", "", None, None, None, gr.update(visible=False)

    deduped_articles = deduplicate_articles(extracted_articles)
    df = pd.DataFrame(deduped_articles)
    result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
    cluster_md_blocks = display_clusters_as_columns_grouped_by_sentiment(result, sentiment_filters)
    csv_file, _ = save_clustered_articles(result["dataframe"], topic or "batch_upload")
    topic_fig = plot_topic_frequency(result)
    sentiment_fig = plot_sentiment_trends(result)
    top_clusters_table = render_top_clusters_table(result)
    return sentiment_filters, *cluster_md_blocks, csv_file, topic_fig, sentiment_fig, top_clusters_table, gr.update(visible=True)

def clear_interface():
    return (
        "",                                  # topic_input
        ["Positive", "Neutral", "Negative"], # sentiment_filter
        "",                                  # urls_input
        "", "", "", "", "",                  # cluster columns 0–4
        gr.update(value=None),               # csv_output (reset download file)
        None, None, None,                    # topic_fig, sentiment_fig, top_clusters_table
        gr.update(visible=False)             # Hide Clustered News Digest section
    )

with gr.Blocks(theme=gr.themes.Base(), css="""
    .gr-markdown { margin: 10px; }
    .analytics-card {background: #f5f7fa; border-radius: 10px; padding: 18px; margin-bottom: 18px;}
""") as demo:
    gr.Markdown(
        "<h1 style='text-align:center;'>📰 Quick Pulse</h1>"
        "<h3 style='text-align:center; color:#1976d2;'>AI-Powered News Summarization with Real-Time Sentiment and Topic Insights</h3>"
        "<p style='text-align:center;'>From headlines to insight, Quick Pulse summarizes news stories, captures emotional context, clusters related topics, and provides analytics at a glance.</p>"
    )

    with gr.Row():
        with gr.Column(scale=2):
            topic_input = gr.Textbox(label="Enter Topic", placeholder="e.g. climate change")
            sentiment_filter = gr.CheckboxGroup(choices=["Positive", "Neutral", "Negative"], value=["Positive", "Neutral", "Negative"], label="Sentiment Filter")
            with gr.Accordion("🔗 Enter Multiple URLs", open=False):
                urls_input = gr.Textbox(label="Enter URLs (newline separated)", lines=4)
            with gr.Row():
                submit_button = gr.Button(" Generate Digest", scale=1)
                latest_news_button = gr.Button("Fetch & Summarize Top News", scale=1)
                clear_button = gr.Button(" Clear", scale=1)
            csv_output = gr.File(label="📁 Download Clustered Digest CSV")
        with gr.Column(scale=3):
            with gr.Row():
                topic_fig = gr.Plot(label="Topic Frequency")
                sentiment_fig = gr.Plot(label="Sentiment Trends")
            top_clusters_table = gr.Dataframe(label="Top Clusters")

    gr.Markdown("---")

    clustered_digest_section = gr.Group(visible=False)
    with clustered_digest_section:
        gr.Markdown("<h3 style='color:#1976d2;'>Clustered News Digest</h3>")
        with gr.Row():
            column_0 = gr.Markdown()
            column_1 = gr.Markdown()
            column_2 = gr.Markdown()
            column_3 = gr.Markdown()
            column_4 = gr.Markdown()

    submit_button.click(
        fn=update_ui_with_columns,
        inputs=[topic_input, urls_input, sentiment_filter],
        outputs=[
            sentiment_filter,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

@@ -240,28 +279,23 @@ with gr.Blocks(theme=gr.themes.Base(), css=".gr-markdown { margin: 10px; }") as demo:

        outputs=[
            sentiment_filter,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

    clear_button.click(
        fn=clear_interface,
        inputs=[],
        outputs=[
            topic_input, sentiment_filter, urls_input,
            column_0, column_1, column_2, column_3, column_4,
            csv_output,
            topic_fig, sentiment_fig, top_clusters_table,
            clustered_digest_section
        ]
    )

if __name__ == "__main__":
    demo.launch()
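For reference, a minimal sketch (not part of the commit) of how deduplicate_articles treats the article dicts produced by extract_summarize_and_analyze_articles; the sample dicts are hypothetical, and importing app builds the Gradio Blocks as a side effect:

# hypothetical_dedup_demo.py — illustrative only
from app import deduplicate_articles  # importing app constructs the UI but does not launch it

articles = [
    {"url": "https://example.com/a", "title": "Rate cut expected", "source": "Reuters", "summary": "Markets expect a cut."},
    {"url": "https://example.com/a", "title": "Rate cut expected", "source": "Reuters", "summary": "Markets expect a cut."},   # same URL -> dropped
    {"url": "https://example.com/b", "title": "Rate cut expected", "source": "Reuters", "summary": "A different write-up."},   # same (title, source) -> dropped
]

print(len(deduplicate_articles(articles)))  # 1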
cluster_news.py
CHANGED
@@ -1,224 +1,169 @@

The KMeans pipeline (cluster_embeddings with a silhouette-based search for the cluster count), filter_similar_topics, get_representative_summaries, and the old topic-ranking block that produced primary_focus and related_topics are removed. New version:

# cluster_news.py
# Clusters news articles using HDBSCAN, labels clusters with TF-IDF n-grams and LDA topics,
# and falls back to a representative summary if the label is too vague.

import numpy as np
import pandas as pd
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.decomposition import LatentDirichletAllocation
import hdbscan
import umap

def generate_embeddings(df, content_column):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return np.array(embeddings)

def reduce_dimensions(embeddings, n_neighbors=10, min_dist=0.0, n_components=5, random_state=42):
    n_samples = embeddings.shape[0]
    if n_samples < 3:
        return embeddings
    n_components = min(max(2, n_components), n_samples - 2)
    n_neighbors = min(max(2, n_neighbors), n_samples - 1)
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state,
        metric='cosine'
    )
    reduced = reducer.fit_transform(embeddings)
    return reduced

def cluster_with_hdbscan(embeddings, min_cluster_size=2, min_samples=1):
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean'
    )
    labels = clusterer.fit_predict(embeddings)
    return labels, clusterer

def extract_tfidf_labels(df, content_column, cluster_labels, top_n=6):
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        if label == -1: continue
        grouped[label].append(df.iloc[idx][content_column])
    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        if len(avg_tfidf) == 0:
            tfidf_labels[cluster_id] = []
            continue
        top_indices = np.argsort(avg_tfidf)[::-1][:top_n]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = top_terms
    return tfidf_labels

def lda_topic_modeling(texts, n_topics=1, n_words=6):
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
    X = vectorizer.fit_transform(texts)
    if X.shape[0] < n_topics:
        n_topics = max(1, X.shape[0])
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)
    topic_words = []
    for topic_idx, topic in enumerate(lda.components_):
        top_indices = topic.argsort()[:-n_words - 1:-1]
        words = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        topic_words.extend(words)
    return topic_words

def get_representative_summary(df, cluster_indices, embeddings, centroid):
    cluster_embs = embeddings[cluster_indices]
    dists = cosine_distances(cluster_embs, centroid.reshape(1, -1)).flatten()
    min_idx = np.argmin(dists)
    return df.iloc[cluster_indices[min_idx]]["summary"]

def label_clusters_hybrid(df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=15):
    cluster_label_map = {}
    cluster_primary_topics = {}
    cluster_related_topics = {}
    for cluster_id in set(cluster_labels):
        if cluster_id == -1:
            continue
        topics = lda_labels.get(cluster_id, []) or tfidf_labels.get(cluster_id, [])
        topics = [t for t in topics if t]
        primary_topics = topics[:3]
        related_topics = topics[3:]
        label = ", ".join(primary_topics) if primary_topics else ""
        if not label or len(label) < vague_threshold:
            cluster_indices = np.where(cluster_labels == cluster_id)[0]
            centroid = embeddings[cluster_indices].mean(axis=0)
            rep_summary = get_representative_summary(df, cluster_indices, embeddings, centroid)
            label = rep_summary[:80] + "..." if len(rep_summary) > 80 else rep_summary
        cluster_label_map[cluster_id] = label
        cluster_primary_topics[cluster_id] = primary_topics
        cluster_related_topics[cluster_id] = related_topics
    return cluster_label_map, cluster_primary_topics, cluster_related_topics

def cluster_and_label_articles(
    df,
    content_column="content",
    summary_column="summary",
    min_cluster_size=2,
    min_samples=1,
    n_neighbors=10,
    min_dist=0.0,
    n_components=5,
    top_n=6,
    lda_n_topics=1,
    lda_n_words=6,
    vague_threshold=15
):
    if df.empty:
        return None

    min_cluster_size = max(2, min(min_cluster_size, len(df) // 2)) if len(df) < 20 else min_cluster_size

    embeddings = generate_embeddings(df, content_column)
    reduced_embeddings = reduce_dimensions(embeddings, n_neighbors, min_dist, n_components)
    cluster_labels, clusterer = cluster_with_hdbscan(reduced_embeddings, min_cluster_size, min_samples)
    df['cluster_id'] = cluster_labels

    tfidf_labels = extract_tfidf_labels(df, content_column, cluster_labels, top_n=top_n)

    lda_labels = {}
    for cluster_id in set(cluster_labels):
        if cluster_id == -1:
            continue
        cluster_texts = df[cluster_labels == cluster_id][content_column].tolist()
        if cluster_texts:
            topics = lda_topic_modeling(
                cluster_texts, n_topics=lda_n_topics, n_words=lda_n_words
            )
            lda_labels[cluster_id] = topics
        else:
            lda_labels[cluster_id] = []

    cluster_label_map, cluster_primary_topics, cluster_related_topics = label_clusters_hybrid(
        df, content_column, summary_column, cluster_labels, embeddings, tfidf_labels, lda_labels, vague_threshold=vague_threshold
    )

    df['cluster_label'] = [
        cluster_label_map.get(cid, "Noise/Other") if cid != -1 else "Noise/Other"
        for cid in cluster_labels
    ]
    df['lda_topics'] = [
        ", ".join(lda_labels.get(cid, [])) if cid != -1 else "" for cid in cluster_labels
    ]

    detected_topics = {
        label: {
            "size": int((df['cluster_label'] == label).sum())
        }
        for label in set(df['cluster_label']) if label != "Noise/Other"
    }

    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": len(detected_topics),
        "cluster_primary_topics": cluster_primary_topics,
        "cluster_related_topics": cluster_related_topics
    }
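A small sketch (not part of the commit) showing the expected input and output shape of cluster_and_label_articles; the DataFrame rows are invented, the run needs sentence-transformers, umap-learn, and hdbscan installed, and with this few articles HDBSCAN may label everything "Noise/Other":

# illustrative driver for cluster_news.cluster_and_label_articles
import pandas as pd
import cluster_news

df = pd.DataFrame([
    {"content": "The central bank raised interest rates again this quarter amid inflation worries.", "summary": "Rates rise again."},
    {"content": "Fresh inflation data pushed the central bank toward another rate hike.", "summary": "Inflation drives hike."},
    {"content": "A new football season kicked off with record attendance across the league.", "summary": "Season opener draws crowds."},
    {"content": "Fans packed stadiums as the league returned this weekend.", "summary": "League returns to packed stadiums."},
])

result = cluster_news.cluster_and_label_articles(df, content_column="content", summary_column="summary")
if result:
    print(result["number_of_clusters"])
    print(result["dataframe"][["cluster_id", "cluster_label"]])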
extract_news.py
CHANGED
@@ -1,244 +1,36 @@

This file is slimmed down sharply (+13 -221). Removed: the RSS extraction path (extract_full_content_rss, extract_news_articles_rss with BeautifulSoup), the URL validity, paywall, duplicate, and blocked-site helpers (is_paywalled, is_paywalled_content, is_duplicate, is_broken, is_valid_url, is_valid_url_content, is_empty_content, is_blocked_site, is_blocked_content), and save_to_json; extract_full_content now returns a dict with url, text, and title instead of a (content, title) tuple. New version:

# extract_news.py
# This script extracts full content from news articles using the newspaper3k library.

import logging
import pandas as pd
from newspaper import Article

def extract_full_content(url, min_length=100):
    try:
        article = Article(url)
        article.download()
        article.parse()
        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None
        return {"url": url, "text": text, "title": title}
    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None

def extract_news_articles(urls, min_length=100):
    extracted_articles = []
    for url in urls:
        article = extract_full_content(url, min_length=min_length)
        if article and article.get("text"):
            article["original_url"] = url
            extracted_articles.append(article)
    return extracted_articles

def create_dataframe(articles):
    return pd.DataFrame(articles)

def save_to_csv(df, filename):
    df.to_csv(filename, index=False)
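A minimal sketch (not part of the commit) of driving the slimmed-down extract_news module end to end; the URL is a placeholder and a network connection plus newspaper3k are required:

# illustrative driver for extract_news
from extract_news import extract_news_articles, create_dataframe, save_to_csv

urls = ["https://example.com/some-article"]  # placeholder URL
articles = extract_news_articles(urls, min_length=100)
if articles:
    df = create_dataframe(articles)       # columns: url, text, title, original_url
    save_to_csv(df, "extracted_articles.csv")
    print(df[["title", "url"]])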
gather_news.py
CHANGED
@@ -1,73 +1,121 @@

The single NewsAPI fetch function that read the key from os.environ ("api_key") is removed; the new module reads config.api_key, fetches either top headlines or topic search, and merges NewsAPI metadata with the full text extracted by extract_news. New version:

# gather_news.py
# News Source Integration
# This script integrates with various news sources to fetch the latest articles from the specified news sources,
# extracts relevant information such as title, URL, Source, Author and Publish date, and extracts full content.

import requests
from extract_news import extract_news_articles, create_dataframe, save_to_csv

def fetch_newsapi_top_headlines(min_length=100, max_articles=30):
    import config
    url = 'https://newsapi.org/v2/top-headlines'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'pageSize': max_articles
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: Failed to fetch news from NewsAPI Top Headlines. Status code: {response.status_code}")
        return []
    articles = response.json().get("articles", [])
    if not articles:
        print("No articles found in NewsAPI Top Headlines.")
        return []
    meta_by_url = {}
    urls = []
    for article in articles:
        url = article.get("url", "#")
        meta = {
            "url": url,
            "title": article.get("title", ""),
            "source": article.get("source", {}).get("name", ""),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
        }
        meta_by_url[url] = meta
        urls.append(url)
    print(f"Fetched {len(urls)} article URLs from NewsAPI Top Headlines.")
    extracted_articles = extract_news_articles(urls, min_length=min_length)
    merged_articles = []
    for art in extracted_articles:
        meta = meta_by_url.get(art.get("original_url"))
        if not meta:
            meta = {
                "title": art.get("title", "Untitled"),
                "source": "",
                "author": "Unknown",
                "publishedAt": "Unknown"
            }
        merged = {
            "url": art.get("url"),
            "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
            "source": meta["source"],
            "author": meta["author"],
            "publishedAt": meta["publishedAt"],
            "text": art.get("text", ""),
        }
        merged_articles.append(merged)
    print(f"Usable articles after extraction (NewsAPI Top Headlines): {len(merged_articles)}")
    return merged_articles

def fetch_newsapi_everything(topic, min_length=100, max_articles=50):
    import config
    url = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'q': topic,
        'pageSize': max_articles,
        'sortBy': 'publishedAt'
    }
    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"Error: Failed to fetch news from NewsAPI Everything. Status code: {response.status_code}")
        return []
    articles = response.json().get("articles", [])
    if not articles:
        print("No articles found in NewsAPI Everything.")
        return []
    meta_by_url = {}
    urls = []
    for article in articles:
        url = article.get("url", "#")
        meta = {
            "url": url,
            "title": article.get("title", ""),
            "source": article.get("source", {}).get("name", ""),
            "author": article.get("author", "Unknown"),
            "publishedAt": article.get("publishedAt", "Unknown"),
        }
        meta_by_url[url] = meta
        urls.append(url)
    print(f"Fetched {len(urls)} article URLs from NewsAPI Everything.")
    extracted_articles = extract_news_articles(urls, min_length=min_length)
    merged_articles = []
    for art in extracted_articles:
        meta = meta_by_url.get(art.get("original_url"))
        if not meta:
            meta = {
                "title": art.get("title", "Untitled"),
                "source": "",
                "author": "Unknown",
                "publishedAt": "Unknown"
            }
        merged = {
            "url": art.get("url"),
            "title": art.get("title") if art.get("title") and art.get("title") != "Untitled" else meta["title"],
            "source": meta["source"],
            "author": meta["author"],
            "publishedAt": meta["publishedAt"],
            "text": art.get("text", ""),
        }
        merged_articles.append(merged)
    print(f"Usable articles after extraction (NewsAPI Everything): {len(merged_articles)}")
    return merged_articles

def fetch_articles(topic=None, min_length=100, max_articles=30):
    if topic and topic.strip():
        return fetch_newsapi_everything(topic, min_length=min_length, max_articles=max_articles)
    else:
        return fetch_newsapi_top_headlines(min_length=min_length, max_articles=max_articles)
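A minimal sketch (not part of the commit) of calling the new fetch layer; it assumes a config.py next to gather_news.py defining api_key with a valid NewsAPI key, which is how the module reads its credentials:

# illustrative driver for gather_news.fetch_articles
# assumes config.py provides: api_key = "<your NewsAPI key>"
import gather_news

articles = gather_news.fetch_articles(topic="climate change", max_articles=10)
for art in articles[:3]:
    print(art["publishedAt"], art["source"], art["title"])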
input_topic.py
CHANGED
@@ -1,7 +1,4 @@
 # input_topic.py
-
-
-# Input Design
 # This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.
 
 def get_topic():
requirements.txt
CHANGED
@@ -9,4 +9,7 @@ numpy
 requests
 gradio
 lxml_html_clean
+plotly.express
+hdbscan
+umap
 sentence_transformers
summarizer.py
CHANGED
@@ -6,14 +6,6 @@ from transformers import pipeline
 # Load summarization pipeline
 summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")
 
-# Load once globally
-
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-#tokenizer = AutoTokenizer.from_pretrained("flant5-base")
-#model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
-#summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
-
 # Function to split text into smaller chunks
 def split_text(text, max_tokens=512):
     words = text.split()
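The hunk shows only the first lines of split_text; a hedged sketch (not part of the commit) of the chunk-then-summarize pattern the module suggests. generate_summary is the entry point app.py calls per article; the return type of split_text (assumed to be a list of text chunks) and the placeholder text are assumptions:

# illustrative use of summarizer.split_text / generate_summary
from summarizer import split_text, generate_summary

long_text = " ".join(["This is a placeholder sentence for a long article body."] * 200)
chunks = split_text(long_text, max_tokens=512)   # assumed: list of chunks sized for the model
print(len(chunks))
print(generate_summary(long_text))               # same call app.py makes for each article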