Upload 6 files
- analyze_sentiment.py +28 -0
- cluster_news.py +224 -0
- extract_news.py +244 -0
- gather_news.py +70 -0
- input_topic.py +50 -0
- summarizer.py +40 -0
analyze_sentiment.py
ADDED
@@ -0,0 +1,28 @@
# analyze_sentiment.py

# This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.


from transformers import pipeline


# Load sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_summary(summary):
    """
    Analyze the sentiment of the given summary.
    Returns a tuple of (sentiment, score).
    """
    try:
        if not summary.strip():
            return "No input provided.", 0.0

        result = sentiment_analyzer(summary)[0]
        sentiment = result['label']
        score = result['score']

        return sentiment, score
    except Exception as e:
        return f"Error analyzing sentiment: {str(e)}", 0.0

# Example usage
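A minimal usage sketch to go with the "# Example usage" comment above (not part of the upload); the sample sentence and the printed output are illustrative.

from analyze_sentiment import analyze_summary

sentiment, score = analyze_summary("The market rallied after a strong earnings report.")
print(f"{sentiment} ({score:.2f})")  # e.g. "POSITIVE (0.99)" from the SST-2 DistilBERT model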
cluster_news.py
ADDED
@@ -0,0 +1,224 @@
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def generate_embeddings(df, content_column):
    """
    Generate embeddings for the content using SentenceTransformer.
    """
    print("🔢 Generating embeddings for clustering...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
    return embeddings


def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
    """
    Determine the optimum number of clusters using silhouette analysis.
    """
    print("🔍 Determining the optimum number of clusters using silhouette analysis...")
    n_samples = len(embeddings)
    if n_samples < 2:
        raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")

    # Adjust max_clusters to ensure it does not exceed n_samples - 1
    max_clusters = min(max_clusters, n_samples - 1)

    best_num_clusters = min_clusters
    best_score = -1

    for n_clusters in range(min_clusters, max_clusters + 1):
        try:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            cluster_labels = kmeans.fit_predict(embeddings)
            score = silhouette_score(embeddings, cluster_labels)
            print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")

            if score > best_score:
                best_score = score
                best_num_clusters = n_clusters
        except ValueError as e:
            print(f"Skipping {n_clusters} clusters due to error: {e}")

    print(f"✅ Optimum number of clusters determined: {best_num_clusters}")
    return best_num_clusters


def cluster_embeddings(embeddings, num_clusters):
    """
    Perform KMeans clustering on the embeddings.
    """
    print(f"📊 Clustering articles into {num_clusters} clusters using KMeans...")
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embeddings)
    return kmeans.labels_, kmeans


def extract_tfidf_labels(df, content_column, cluster_labels):
    """
    Extract top TF-IDF keywords for each cluster.
    """
    print("🔠 Extracting TF-IDF-based keywords for cluster labels...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    tfidf_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
        tfidf_matrix = vectorizer.fit_transform(texts)
        avg_tfidf = tfidf_matrix.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
        tfidf_labels[cluster_id] = ", ".join(top_terms)

    return tfidf_labels


def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
    """
    Apply topic modeling (LDA) within each cluster to refine and describe topics.
    """
    print("🔍 Applying topic modeling within each cluster...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(df.iloc[idx][content_column])

    topic_labels = {}
    for cluster_id, texts in grouped.items():
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
        tfidf_matrix = vectorizer.fit_transform(texts)

        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        lda.fit(tfidf_matrix)

        # Extract top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic_idx, topic in enumerate(lda.components_):
            top_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_indices]))
        topic_labels[cluster_id] = " | ".join(topics)

    return topic_labels


def filter_similar_topics(topic_keywords_list, threshold=0.75):
    """
    Filter out similar topics based on cosine similarity of their embeddings.
    """
    print("🔄 Filtering similar topics...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
    embeddings = model.encode(topic_sentences)
    unique_indices = []
    for i, emb in enumerate(embeddings):
        if all(cosine_similarity([emb], [embeddings[j]])[0][0] < threshold for j in unique_indices):
            unique_indices.append(i)
    return [topic_keywords_list[i] for i in unique_indices]


def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
    """
    Get the most representative summary for each cluster based on proximity to the cluster centroid.
    """
    print("🔄 Refining cluster labels using representative summaries...")
    representatives = {}
    for i in range(kmeans.n_clusters):
        indices = [j for j, label in enumerate(cluster_labels) if label == i]
        if not indices:
            continue
        cluster_embeddings = embeddings[indices]
        centroid = kmeans.cluster_centers_[i]
        distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
        closest_idx = indices[np.argmin(distances)]
        representatives[i] = df.iloc[closest_idx][summary_column]

    return representatives


def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
    """
    Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and Topic Modeling.
    Display detected topics for each cluster with Primary focus and Related topics.
    """
    if df.empty:
        print("No articles to cluster.")
        return None

    # Step 1: Generate embeddings
    embeddings = generate_embeddings(df, content_column)

    # Step 2: Determine the optimum number of clusters
    num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)

    # Step 3: Perform clustering
    cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
    df['cluster_label'] = cluster_labels

    # Step 4: Extract TF-IDF matrix
    print("🔠 Extracting TF-IDF matrix for clusters...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
    feature_names = vectorizer.get_feature_names_out()

    # Step 5: Process each cluster
    print("🔍 Processing clusters for TF-IDF and topic modeling...")
    grouped = defaultdict(list)
    for idx, label in enumerate(cluster_labels):
        grouped[label].append(idx)

    refined_labels = [""] * num_clusters  # Initialize refined_labels with empty strings
    detected_topics = {}
    for cluster_id, indices in grouped.items():
        cluster_texts = tfidf_matrix[indices]

        # Extract TF-IDF keywords
        avg_tfidf = cluster_texts.mean(axis=0).A1
        top_indices = np.argsort(avg_tfidf)[::-1][:3]
        tfidf_keywords = [feature_names[i] for i in top_indices]

        # Generate a cluster label using the top TF-IDF keywords
        cluster_label_tfidf = ", ".join(tfidf_keywords)

        # Apply topic modeling
        lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
        lda.fit(cluster_texts)
        topics = []
        topic_weights = []
        for topic_idx, topic in enumerate(lda.components_):
            top_topic_indices = topic.argsort()[:-4:-1]
            topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
            topic_weights.append(topic.sum())  # Sum of weights for ranking

        # Rank topics by importance
        ranked_topics = [x for _, x in sorted(zip(topic_weights, topics), reverse=True)]

        # Generate Primary focus and Related topics
        primary_focus = ranked_topics[0] if ranked_topics else "N/A"
        related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []

        # Store detected topics for user display
        detected_topics[cluster_label_tfidf] = {
            "primary_focus": primary_focus,
            "related_topics": related_topics,
        }

        # Assign the TF-IDF keywords as the cluster label
        refined_labels[cluster_id] = cluster_label_tfidf

    # Assign refined labels to clusters
    df['cluster_label'] = [refined_labels[label] for label in cluster_labels]

    print("✅ Clustering and labeling complete!")
    return {
        "dataframe": df,
        "detected_topics": detected_topics,
        "number_of_clusters": num_clusters,
    }
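A sketch of how cluster_and_label_articles could be called, assuming a DataFrame with "content" and "summary" columns; the toy articles below are placeholders, and real inputs would be full article texts produced earlier in the pipeline.

import pandas as pd
from cluster_news import cluster_and_label_articles

# Placeholder articles; in the app these come from the extraction/summarization steps
df = pd.DataFrame({
    "content": [
        "Central banks signal further interest rate cuts amid cooling inflation.",
        "New battery chemistry promises cheaper grid-scale energy storage.",
        "Inflation data pushes bond yields lower across major economies.",
    ],
    "summary": [
        "Rate cuts expected as inflation cools.",
        "Cheaper grid storage from new batteries.",
        "Bond yields fall on inflation data.",
    ],
})

result = cluster_and_label_articles(df, max_clusters=3)
if result:
    print("Clusters:", result["number_of_clusters"])
    for label, topics in result["detected_topics"].items():
        print(label, "->", topics["primary_focus"])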
extract_news.py
ADDED
@@ -0,0 +1,244 @@
# extract_news.py

# This script extracts news articles from various sources, including NewsAPI and Google News RSS, using the URLs saved by gather_news.py.
# It includes functions for extracting clean, full-text content from the articles and storing the metadata in a file.


# Article Scraping & Text Extraction

from newspaper import Article
import pandas as pd
import logging
import re
import requests
from bs4 import BeautifulSoup


# For each URL from NewsAPI or RSS: create Article(url), call .download(), .parse(), and read .text;
# optionally use .nlp() to get a summary and keywords.

def extract_full_content(url, min_length=300):
    """
    Extract full content and title from the given URL using newspaper3k.
    Always returns a tuple (content, title) or (None, None).
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        text = article.text.strip()
        title = article.title.strip() if article.title else "Untitled"

        # Filter out short content
        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Failed to extract content from {url}: {str(e)}")
        return None, None


def extract_full_content_rss(url, min_length=300):
    """
    Extract full content and title from an RSS article using BeautifulSoup.
    Always returns a tuple: (text, title) or (None, None).
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Error fetching URL {url}: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
        paragraphs = soup.find_all('p')
        text = ' '.join([para.get_text() for para in paragraphs]).strip()

        if len(text) < min_length:
            logging.warning(f"Extracted content is too short from {url}.")
            return None, None

        return text, title

    except Exception as e:
        logging.error(f"Error extracting content from {url}: {str(e)}")
        return None, None


# Handle common edge cases: paywalled content (skip or tag), duplicate links, and broken URLs.

def is_paywalled(url):
    """
    Check if the URL looks paywalled.
    """
    paywall_indicators = ['paywall', 'subscription', 'premium']
    return any(indicator in url for indicator in paywall_indicators)

def is_paywalled_content(article):
    """
    Check if the article is paywalled.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_paywalled(article.get("url", "")):
        return True
    return False

def is_duplicate(url, existing_urls):
    """
    Check if the URL is a duplicate.
    """
    return url in existing_urls

def is_broken(url):
    """
    Check if the URL is broken.
    """
    try:
        response = requests.head(url, allow_redirects=True)
        return response.status_code != 200
    except requests.RequestException:
        return True

def is_valid_url(url):
    """
    Check if the URL is syntactically valid.
    """
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(regex, url) is not None

def is_valid_url_content(url):
    """
    Check if the URL is valid, reachable, and not paywalled.
    """
    if not url:
        return False
    if not is_valid_url(url):
        return False
    if is_paywalled(url):
        return False
    if is_broken(url):
        return False
    return True

# Additional functions to check whether an article has empty content or comes from a blocked site.

def is_empty_content(article):
    """
    Check if the article content is empty.
    """
    if not article:
        return True
    if not article.get("text"):
        return True
    return False

def is_blocked_site(url):
    """
    Check if the URL is from a blocked site.
    """
    blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
    return any(blocked_site in url for blocked_site in blocked_sites)

def is_blocked_content(article):
    """
    Check if the article is from a blocked site.
    """
    if not article:
        return False
    if not article.get("text"):
        return False
    if is_blocked_site(article.get("url", "")):
        return True
    return False

# Extract news articles from the given URLs

def extract_news_articles(urls):
    """
    Extract news articles from the given URLs.
    """
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        # extract_full_content returns a (text, title) tuple; build the article dict expected downstream
        text, title = extract_full_content(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"title": title, "url": url, "text": text}

        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

def extract_news_articles_rss(urls):
    """
    Extract news articles from the given RSS URLs.
    """
    extracted_articles = []
    existing_urls = set()

    for url in urls:
        if not is_valid_url_content(url):
            logging.warning(f"Skipping invalid or paywalled URL: {url}")
            continue
        if is_duplicate(url, existing_urls):
            logging.warning(f"Skipping duplicate URL: {url}")
            continue
        existing_urls.add(url)

        # extract_full_content_rss also returns a (text, title) tuple
        text, title = extract_full_content_rss(url)
        if not text:
            logging.warning(f"Failed to extract content from {url}")
            continue
        article = {"title": title, "url": url, "text": text}

        if is_paywalled_content(article):
            logging.warning(f"Skipping paywalled content from URL: {url}")
            continue

        extracted_articles.append(article)

    return extracted_articles

# Metadata Structuring and Storage
# Helpers to build a DataFrame of the extracted metadata (title, url, source, author, published_at, full_text)
# for each extracted article and save it to a CSV or JSON file.

def create_dataframe(articles):
    """
    Create a pandas DataFrame from the list of articles.
    """
    return pd.DataFrame(articles)

def save_to_csv(df, filename):
    """
    Save the DataFrame to a CSV file.
    """
    df.to_csv(filename, index=False)

def save_to_json(df, filename):
    """
    Save the DataFrame to a JSON file.
    """
    df.to_json(filename, orient="records", lines=True)
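A short sketch of driving the extractor and saving the metadata; the URLs below are placeholders (real ones come from gather_news.py) and the output filename is arbitrary.

from extract_news import extract_news_articles, create_dataframe, save_to_csv

# Placeholder URLs; in the app these come from fetch_articles_newsapi / fetch_articles_google
urls = [
    "https://example.org/news/story-1",
    "https://example.org/news/story-2",
]

articles = extract_news_articles(urls)
df = create_dataframe(articles)
save_to_csv(df, "extracted_articles.csv")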
gather_news.py
ADDED
@@ -0,0 +1,70 @@
# gather_news.py


# News Source Integration
# This script integrates with various news sources to fetch the latest articles for a given topic
# and extracts relevant information such as title, URL, source, author, and publish date.

import config
import requests
import feedparser

def fetch_articles_newsapi(topic):
    """
    Fetch articles from NewsAPI based on the provided topic.
    """
    url = 'https://newsapi.org/v2/everything'
    params = {
        'apiKey': config.api_key,
        'language': 'en',
        'q': topic,
        'pageSize': 20
    }
    try:
        response = requests.get(url, params=params)
        if response.status_code != 200:
            return f"Error: Failed to fetch news. Status code: {response.status_code}"

        articles = response.json().get("articles", [])
        if not articles:
            return "No articles found."

        # Extract relevant information from each article
        extracted_articles = []
        for article in articles:
            extracted_articles.append({
                "title": article.get("title", "No title"),
                "url": article.get("url", "#"),
                "source": article.get("source", {}).get("name", "Unknown"),
                "author": article.get("author", "Unknown"),
                "publishedAt": article.get("publishedAt", "Unknown")
            })

        return extracted_articles
    except Exception as e:
        return f"Error fetching news: {str(e)}"

def fetch_articles_google(topic):
    """
    Fetch articles from the Google News RSS feed based on the provided topic.
    """
    rss_url = f'https://news.google.com/rss/search?q={topic}&hl=en-US&gl=US&ceid=US:en'
    try:
        feed = feedparser.parse(rss_url)
        if not feed.entries:
            return "No articles found."

        # Extract relevant information from each article
        extracted_articles = []
        for entry in feed.entries[:20]:  # Limit to top 20 articles
            extracted_articles.append({
                "title": entry.title,
                "url": entry.link,
                "source": entry.source.title if hasattr(entry, 'source') else "Unknown",
                "author": entry.author if hasattr(entry, 'author') else "Unknown",
                "publishedAt": entry.published if hasattr(entry, 'published') else "Unknown"
            })

        return extracted_articles
    except Exception as e:
        return f"Error fetching news: {str(e)}"
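A usage sketch assuming a config.py alongside this file that defines api_key (a NewsAPI key); the topic string is illustrative. Both fetchers return a list of article dicts on success and a plain error string otherwise, which the caller has to distinguish.

from gather_news import fetch_articles_newsapi, fetch_articles_google

topic = "renewable energy"

newsapi_articles = fetch_articles_newsapi(topic)
if isinstance(newsapi_articles, str):        # error message or "No articles found."
    print(newsapi_articles)
else:
    urls = [a["url"] for a in newsapi_articles]
    print(f"Collected {len(urls)} URLs from NewsAPI")

google_articles = fetch_articles_google(topic)  # Google News RSS needs no API key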
input_topic.py
ADDED
@@ -0,0 +1,50 @@
# input_topic.py


# Input Design
# This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.

def get_topic():
    topic = input("Enter a topic to search for news articles: ")
    if not topic:
        print("No topic provided. Please enter a valid topic.")
        return None
    if len(topic) > 100:  # Arbitrary limit for topic length
        print("Topic is too long. Please enter a shorter topic.")
        return None
    if not topic.isascii():
        print("Topic contains non-ASCII characters. Please use only ASCII characters.")
        return None
    if not topic.isprintable():
        print("Topic contains non-printable characters. Please use only printable characters.")
        return None
    if topic[0].isdigit():
        print("Topic should not start with a digit. Please enter a valid topic.")
        return None
    if topic[0] == ' ':
        print("Topic should not start with a space. Please enter a valid topic.")
        return None
    # Normalize the input to lowercase and strip any leading/trailing whitespace.
    topic = topic.lower().strip()
    # Check for special characters and replace them with spaces.
    special_chars = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '{', '}', '[', ']', '|', ':', ';', '"', "'", '<', '>', ',', '.', '?', '/', '\\']
    for char in special_chars:
        topic = topic.replace(char, ' ')
    # Remove extra spaces
    topic = ' '.join(topic.split())
    # Check if the topic is empty after normalization
    if not topic:
        print("Topic is empty after normalization. Please enter a valid topic.")
        return None
    # Check for common stop words and remove them
    stop_words = ['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'on', 'with', 'as', 'by', 'this', 'that']
    topic_words = topic.split()
    topic = ' '.join([word for word in topic_words if word not in stop_words])
    # Check if the topic is empty after removing stop words
    if not topic:
        print("Topic is empty after removing stop words. Please enter a valid topic.")
        return None

    return topic
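A brief interactive sketch; get_topic prompts on stdin and returns either a normalized topic string or None, so callers should check before continuing. The printed messages are illustrative.

from input_topic import get_topic

topic = get_topic()
if topic:
    print(f"Searching news for: {topic}")
else:
    print("No usable topic entered.")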
summarizer.py
ADDED
@@ -0,0 +1,40 @@
# summarizer.py
# This script summarizes the content of each article on the specified topic using the Hugging Face Transformers library.

from transformers import pipeline

# Load summarization pipeline once globally
summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")

# Alternative: load the tokenizer and model explicitly
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#tokenizer = AutoTokenizer.from_pretrained("flant5-base")
#model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
#summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Function to split text into smaller chunks
def split_text(text, max_tokens=512):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield ' '.join(words[i:i + max_tokens])

# Function to clean text
def clean_text(text):
    text = ' '.join(text.split())
    text = ' '.join(word for word in text.split() if len(word) < 100)
    return text

def generate_summary(content):
    try:
        if not content.strip():
            return "No input provided."
        cleaned_text = clean_text(content)
        chunks = list(split_text(cleaned_text))
        # Summarize each chunk and join the pieces so long articles stay within the model's input limit
        summary = ''.join([summarizer(chunk, do_sample=False)[0]['summary_text'] for chunk in chunks if chunk.strip()]) if chunks else ''
        return summary
    except Exception as e:
        return f"Error generating summary: {str(e)}"
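A hedged end-to-end sketch (a hypothetical run_pipeline.py, not part of this commit) showing how the uploaded modules could be wired together; it assumes the files sit in one directory and that config.api_key holds a valid NewsAPI key. Clustering via cluster_and_label_articles could then run on the resulting DataFrame.

import pandas as pd

from input_topic import get_topic
from gather_news import fetch_articles_newsapi
from extract_news import extract_news_articles
from summarizer import generate_summary
from analyze_sentiment import analyze_summary

topic = get_topic()
if topic:
    gathered = fetch_articles_newsapi(topic)
    if isinstance(gathered, str):            # a plain string signals an error message
        print(gathered)
    else:
        urls = [a["url"] for a in gathered]
        rows = []
        for article in extract_news_articles(urls):
            summary = generate_summary(article["text"])
            sentiment, score = analyze_summary(summary)
            rows.append({"title": article["title"], "summary": summary,
                         "sentiment": sentiment, "score": score})
        print(pd.DataFrame(rows).head())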