harao-ml committed
Commit 7c3be27 · verified · 1 Parent(s): d0c5c2c

Upload 6 files

Files changed (6)
  1. analyze_sentiment.py +28 -0
  2. cluster_news.py +224 -0
  3. extract_news.py +244 -0
  4. gather_news.py +70 -0
  5. input_topic.py +50 -0
  6. summarizer.py +40 -0
analyze_sentiment.py ADDED
@@ -0,0 +1,28 @@
+ # analyze_sentiment.py
+
+ # This script analyzes the sentiment of the summarized content using the Hugging Face Transformers library.
+
+ from transformers import pipeline
+
+ # Load sentiment analysis pipeline
+ sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+
+ def analyze_summary(summary):
+     """
+     Analyze the sentiment of the given summary.
+     Returns a tuple of (sentiment, score).
+     """
+     try:
+         if not summary.strip():
+             return "No input provided.", 0.0
+
+         result = sentiment_analyzer(summary)[0]
+         sentiment = result['label']
+         score = result['score']
+
+         return sentiment, score
+     except Exception as e:
+         return f"Error analyzing sentiment: {str(e)}", 0.0
+
+ # Example usage
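
A minimal usage sketch for this module, assuming analyze_sentiment.py is importable from the project directory; the sample text is illustrative:

from analyze_sentiment import analyze_summary

sample_summary = "The company reported record profits and strong growth this quarter."
sentiment, score = analyze_summary(sample_summary)

# The SST-2 DistilBERT model returns a POSITIVE or NEGATIVE label with a confidence score
print(f"Sentiment: {sentiment} (confidence: {score:.2f})")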
cluster_news.py ADDED
@@ -0,0 +1,224 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.cluster import KMeans
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.decomposition import LatentDirichletAllocation
+ from sklearn.metrics import silhouette_score
+ from collections import defaultdict
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+
+ def generate_embeddings(df, content_column):
+     """
+     Generate embeddings for the content using SentenceTransformer.
+     """
+     print("🔢 Generating embeddings for clustering...")
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = model.encode(df[content_column].tolist(), show_progress_bar=True)
+     return embeddings
+
+
+ def determine_optimum_clusters(embeddings, min_clusters=2, max_clusters=10):
+     """
+     Determine the optimum number of clusters using silhouette analysis.
+     """
+     print("🔍 Determining the optimum number of clusters using silhouette analysis...")
+     n_samples = len(embeddings)
+     if n_samples < 2:
+         raise ValueError("Not enough samples to perform clustering. At least 2 samples are required.")
+
+     # Adjust max_clusters to ensure it does not exceed n_samples - 1
+     max_clusters = min(max_clusters, n_samples - 1)
+
+     best_num_clusters = min_clusters
+     best_score = -1
+
+     for n_clusters in range(min_clusters, max_clusters + 1):
+         try:
+             kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+             cluster_labels = kmeans.fit_predict(embeddings)
+             score = silhouette_score(embeddings, cluster_labels)
+             print(f"Number of clusters: {n_clusters}, Silhouette Score: {score:.4f}")
+
+             if score > best_score:
+                 best_score = score
+                 best_num_clusters = n_clusters
+         except ValueError as e:
+             print(f"Skipping {n_clusters} clusters due to error: {e}")
+
+     print(f"✅ Optimum number of clusters determined: {best_num_clusters}")
+     return best_num_clusters
+
+
+ def cluster_embeddings(embeddings, num_clusters):
+     """
+     Perform KMeans clustering on the embeddings.
+     """
+     print(f"📊 Clustering articles into {num_clusters} clusters using KMeans...")
+     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+     kmeans.fit(embeddings)
+     return kmeans.labels_, kmeans
+
+
+ def extract_tfidf_labels(df, content_column, cluster_labels):
+     """
+     Extract top TF-IDF keywords for each cluster.
+     """
+     print("🔠 Extracting TF-IDF-based keywords for cluster labels...")
+     grouped = defaultdict(list)
+     for idx, label in enumerate(cluster_labels):
+         grouped[label].append(df.iloc[idx][content_column])
+
+     tfidf_labels = {}
+     for cluster_id, texts in grouped.items():
+         vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=50)
+         tfidf_matrix = vectorizer.fit_transform(texts)
+         avg_tfidf = tfidf_matrix.mean(axis=0).A1
+         top_indices = np.argsort(avg_tfidf)[::-1][:3]
+         top_terms = [vectorizer.get_feature_names_out()[i] for i in top_indices]
+         tfidf_labels[cluster_id] = ", ".join(top_terms)
+
+     return tfidf_labels
+
+
+ def apply_topic_modeling(df, content_column, cluster_labels, num_topics=2):
+     """
+     Apply topic modeling (LDA) within each cluster to refine and describe topics.
+     """
+     print("🔍 Applying topic modeling within each cluster...")
+     grouped = defaultdict(list)
+     for idx, label in enumerate(cluster_labels):
+         grouped[label].append(df.iloc[idx][content_column])
+
+     topic_labels = {}
+     for cluster_id, texts in grouped.items():
+         vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
+         tfidf_matrix = vectorizer.fit_transform(texts)
+
+         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
+         lda.fit(tfidf_matrix)
+
+         # Extract top words for each topic
+         feature_names = vectorizer.get_feature_names_out()
+         topics = []
+         for topic_idx, topic in enumerate(lda.components_):
+             top_indices = topic.argsort()[:-4:-1]
+             topics.append(", ".join([feature_names[i] for i in top_indices]))
+         topic_labels[cluster_id] = " | ".join(topics)
+
+     return topic_labels
+
+
+ def filter_similar_topics(topic_keywords_list, threshold=0.75):
+     """
+     Filter out similar topics based on cosine similarity of their embeddings.
+     """
+     print("🔄 Filtering similar topics...")
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     topic_sentences = [", ".join(kw) for kw in topic_keywords_list]
+     embeddings = model.encode(topic_sentences)
+     unique_indices = []
+     for i, emb in enumerate(embeddings):
+         if all(cosine_similarity([emb], [embeddings[j]])[0][0] < threshold for j in unique_indices):
+             unique_indices.append(i)
+     return [topic_keywords_list[i] for i in unique_indices]
+
+
+ def get_representative_summaries(df, summary_column, embeddings, cluster_labels, kmeans):
+     """
+     Get the most representative summary for each cluster based on proximity to the cluster centroid.
+     """
+     print("🔄 Refining cluster labels using representative summaries...")
+     representatives = {}
+     for i in range(kmeans.n_clusters):
+         indices = [j for j, label in enumerate(cluster_labels) if label == i]
+         if not indices:
+             continue
+         cluster_embeddings = embeddings[indices]
+         centroid = kmeans.cluster_centers_[i]
+         distances = np.linalg.norm(cluster_embeddings - centroid, axis=1)
+         closest_idx = indices[np.argmin(distances)]
+         representatives[i] = df.iloc[closest_idx][summary_column]
+
+     return representatives
+
+
+ def cluster_and_label_articles(df, content_column="content", summary_column="summary", min_clusters=2, max_clusters=10, max_topics=3):
+     """
+     Cluster articles using SentenceTransformer embeddings and label clusters using TF-IDF and Topic Modeling.
+     Display detected topics for each cluster with Primary focus and Related topics.
+     """
+     if df.empty:
+         print("No articles to cluster.")
+         return None
+
+     # Step 1: Generate embeddings
+     embeddings = generate_embeddings(df, content_column)
+
+     # Step 2: Determine the optimum number of clusters
+     num_clusters = determine_optimum_clusters(embeddings, min_clusters, max_clusters)
+
+     # Step 3: Perform clustering
+     cluster_labels, kmeans = cluster_embeddings(embeddings, num_clusters)
+     df['cluster_label'] = cluster_labels
+
+     # Step 4: Extract TF-IDF matrix
+     print("🔠 Extracting TF-IDF matrix for clusters...")
+     vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english", max_features=5000)
+     tfidf_matrix = vectorizer.fit_transform(df[content_column].tolist())
+     feature_names = vectorizer.get_feature_names_out()
+
+     # Step 5: Process each cluster
+     print("🔍 Processing clusters for TF-IDF and topic modeling...")
+     grouped = defaultdict(list)
+     for idx, label in enumerate(cluster_labels):
+         grouped[label].append(idx)
+
+     refined_labels = [""] * num_clusters  # Initialize refined_labels with empty strings
+     detected_topics = {}
+     for cluster_id, indices in grouped.items():
+         cluster_texts = tfidf_matrix[indices]
+
+         # Extract TF-IDF keywords
+         avg_tfidf = cluster_texts.mean(axis=0).A1
+         top_indices = np.argsort(avg_tfidf)[::-1][:3]
+         tfidf_keywords = [feature_names[i] for i in top_indices]
+
+         # Generate a cluster label using the top TF-IDF keywords
+         cluster_label_tfidf = ", ".join(tfidf_keywords)
+
+         # Apply topic modeling
+         lda = LatentDirichletAllocation(n_components=min(max_topics, len(indices)), random_state=42)
+         lda.fit(cluster_texts)
+         topics = []
+         topic_weights = []
+         for topic_idx, topic in enumerate(lda.components_):
+             top_topic_indices = topic.argsort()[:-4:-1]
+             topics.append(", ".join([feature_names[i] for i in top_topic_indices]))
+             topic_weights.append(topic.sum())  # Sum of weights for ranking
+
+         # Rank topics by importance
+         ranked_topics = [x for _, x in sorted(zip(topic_weights, topics), reverse=True)]
+
+         # Generate Primary focus and Related topics
+         primary_focus = ranked_topics[0] if ranked_topics else "N/A"
+         related_topics = ranked_topics[1:] if len(ranked_topics) > 1 else []
+
+         # Store detected topics for user display
+         detected_topics[cluster_label_tfidf] = {
+             "primary_focus": primary_focus,
+             "related_topics": related_topics,
+         }
+
+         # Assign the TF-IDF keywords as the cluster label
+         refined_labels[cluster_id] = cluster_label_tfidf
+
+     # Assign refined labels to clusters
+     df['cluster_label'] = [refined_labels[label] for label in cluster_labels]
+
+     print("✅ Clustering and labeling complete!")
+     return {
+         "dataframe": df,
+         "detected_topics": detected_topics,
+         "number_of_clusters": num_clusters,
+     }
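
A short driver sketch for the clustering step. It assumes a DataFrame whose 'content' column holds article text and whose 'summary' column holds generated summaries (both are the function's default column names); the CSV file name is an assumption:

import pandas as pd
from cluster_news import cluster_and_label_articles

df = pd.read_csv("extracted_articles.csv")  # hypothetical output of the extraction step

result = cluster_and_label_articles(df, content_column="content", summary_column="summary")
if result:
    print(f"Clusters found: {result['number_of_clusters']}")
    for label, topics in result["detected_topics"].items():
        print(f"- {label}: primary focus -> {topics['primary_focus']}")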
extract_news.py ADDED
@@ -0,0 +1,244 @@
+ # extract_news.py
+
+ # This script extracts news articles from various sources, including NewsAPI and Google News RSS, using the URLs saved by gather_news.py.
+ # It includes functions for extracting clean, full-text content from the articles and storing the metadata in a file.
+
+ # Article Scraping & Text Extraction
+
+ from newspaper import Article
+ import pandas as pd
+ import logging
+ import re
+ import requests
+ from bs4 import BeautifulSoup
+
+ # For each URL from NewsAPI or RSS: create Article(url), call .download(), .parse(), and .text, and optionally use .nlp() to get a summary and keywords.
+
+ def extract_full_content(url, min_length=300):
+     """
+     Extract full content and title from the given URL using newspaper3k.
+     Always returns a tuple (content, title) or (None, None).
+     """
+     try:
+         article = Article(url)
+         article.download()
+         article.parse()
+
+         text = article.text.strip()
+         title = article.title.strip() if article.title else "Untitled"
+
+         # Filter out short content
+         if len(text) < min_length:
+             logging.warning(f"Extracted content is too short from {url}.")
+             return None, None
+
+         return text, title
+
+     except Exception as e:
+         logging.error(f"Failed to extract content from {url}: {str(e)}")
+         return None, None
+
+
+ def extract_full_content_rss(url, min_length=300):
+     """
+     Extract full content and title from an RSS article using BeautifulSoup.
+     Always returns a tuple: (text, title) or (None, None).
+     """
+     try:
+         response = requests.get(url, timeout=10)
+         if response.status_code != 200:
+             logging.error(f"Error fetching URL {url}: {response.status_code}")
+             return None, None
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+         title = soup.title.string.strip() if soup.title and soup.title.string else "Untitled"
+         paragraphs = soup.find_all('p')
+         text = ' '.join([para.get_text() for para in paragraphs]).strip()
+
+         if len(text) < min_length:
+             logging.warning(f"Extracted content is too short from {url}.")
+             return None, None
+
+         return text, title
+
+     except Exception as e:
+         logging.error(f"Error extracting content from {url}: {str(e)}")
+         return None, None
+
+
+ # Handle common edge cases such as paywalled content (skip or tag), duplicate links, and broken URLs.
+ def is_paywalled(url):
+     """
+     Check if the URL looks paywalled.
+     """
+     paywall_indicators = ['paywall', 'subscription', 'premium']
+     return any(indicator in url for indicator in paywall_indicators)
+
+ def is_paywalled_content(article):
+     """
+     Check if the article is paywalled.
+     """
+     if not article:
+         return False
+     if not article.get("text"):
+         return False
+     if is_paywalled(article.get("url", "")):
+         return True
+     return False
+
+ def is_duplicate(url, existing_urls):
+     """
+     Check if the URL is a duplicate.
+     """
+     return url in existing_urls
+
+ def is_broken(url):
+     """
+     Check if the URL is broken.
+     """
+     try:
+         response = requests.head(url, allow_redirects=True, timeout=10)
+         return response.status_code != 200
+     except requests.RequestException:
+         return True
+
+ def is_valid_url(url):
+     """
+     Check if the URL is syntactically valid.
+     """
+     regex = re.compile(
+         r'^(?:http|ftp)s?://'  # http:// or https://
+         r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
+         r'localhost|'  # localhost...
+         r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
+         r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
+         r'(?::\d+)?'  # optional port
+         r'(?:/?|[/?]\S+)$', re.IGNORECASE)
+     return re.match(regex, url) is not None
+
+ def is_valid_url_content(url):
+     """
+     Check if the URL is usable: valid, not paywalled, and reachable.
+     """
+     if not url:
+         return False
+     if not is_valid_url(url):
+         return False
+     if is_paywalled(url):
+         return False
+     if is_broken(url):
+         return False
+     return True
+
+ # Additional functions to check whether an article has empty content or comes from a blocked site
+
+ def is_empty_content(article):
+     """
+     Check if the article content is empty.
+     """
+     if not article:
+         return True
+     if not article.get("text"):
+         return True
+     return False
+
+ def is_blocked_site(url):
+     """
+     Check if the URL is from a blocked site.
+     """
+     blocked_sites = ['example.com', 'blockedsite.com']  # Add your blocked sites here
+     return any(blocked_site in url for blocked_site in blocked_sites)
+
+ def is_blocked_content(article):
+     """
+     Check if the article is from a blocked site.
+     """
+     if not article:
+         return False
+     if not article.get("text"):
+         return False
+     if is_blocked_site(article.get("url", "")):
+         return True
+     return False
+
+ # Extract news articles from the given URLs
+
+ def extract_news_articles(urls):
+     """
+     Extract news articles from the given URLs.
+     """
+     extracted_articles = []
+     existing_urls = set()
+
+     for url in urls:
+         if not is_valid_url_content(url):
+             logging.warning(f"Skipping invalid or paywalled URL: {url}")
+             continue
+         if is_duplicate(url, existing_urls):
+             logging.warning(f"Skipping duplicate URL: {url}")
+             continue
+         existing_urls.add(url)
+
+         # extract_full_content returns a (text, title) tuple, or (None, None) on failure
+         text, title = extract_full_content(url)
+         if not text:
+             logging.warning(f"Failed to extract content from {url}")
+             continue
+
+         article = {"title": title, "url": url, "text": text}
+         if is_paywalled_content(article):
+             logging.warning(f"Skipping paywalled content from URL: {url}")
+             continue
+
+         extracted_articles.append(article)
+
+     return extracted_articles
+
+ def extract_news_articles_rss(urls):
+     """
+     Extract news articles from the given RSS URLs.
+     """
+     extracted_articles = []
+     existing_urls = set()
+
+     for url in urls:
+         if not is_valid_url_content(url):
+             logging.warning(f"Skipping invalid or paywalled URL: {url}")
+             continue
+         if is_duplicate(url, existing_urls):
+             logging.warning(f"Skipping duplicate URL: {url}")
+             continue
+         existing_urls.add(url)
+
+         # extract_full_content_rss returns a (text, title) tuple, or (None, None) on failure
+         text, title = extract_full_content_rss(url)
+         if not text:
+             logging.warning(f"Failed to extract content from {url}")
+             continue
+
+         article = {"title": title, "url": url, "text": text}
+         if is_paywalled_content(article):
+             logging.warning(f"Skipping paywalled content from URL: {url}")
+             continue
+
+         extracted_articles.append(article)
+
+     return extracted_articles
+
+ # Metadata Structuring and Storage
+ # Functions to build a DataFrame with the metadata for each extracted article (title, url, source, author, published_at, full_text) and save it to a CSV or JSON file.
+
+ def create_dataframe(articles):
+     """
+     Create a pandas DataFrame from the list of articles.
+     """
+     return pd.DataFrame(articles)
+
+ def save_to_csv(df, filename):
+     """
+     Save the DataFrame to a CSV file.
+     """
+     df.to_csv(filename, index=False)
+
+ def save_to_json(df, filename):
+     """
+     Save the DataFrame to a JSON file.
+     """
+     df.to_json(filename, orient="records", lines=True)
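
A minimal sketch of the extraction flow with hypothetical URLs, assuming the gathered metadata has already been reduced to a list of links:

import logging
from extract_news import extract_news_articles, create_dataframe, save_to_csv

logging.basicConfig(level=logging.INFO)

urls = [
    "https://news.example.org/story-one",  # hypothetical URLs
    "https://news.example.org/story-two",
]

articles = extract_news_articles(urls)  # list of {"title", "url", "text"} dicts
df = create_dataframe(articles)
save_to_csv(df, "extracted_articles.csv")
print(f"Saved {len(df)} articles.")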
gather_news.py ADDED
@@ -0,0 +1,70 @@
+ # gather_news.py
+
+ # News Source Integration
+ # This script integrates with various news sources to fetch the latest articles on a given topic and extracts relevant information such as title, URL, source, author, and publish date.
+
+ import config
+ import requests
+ import feedparser
+
+ def fetch_articles_newsapi(topic):
+     """
+     Fetch articles from NewsAPI based on the provided topic.
+     """
+     url = 'https://newsapi.org/v2/everything'
+     params = {
+         'apiKey': config.api_key,
+         'language': 'en',
+         'q': topic,
+         'pageSize': 20
+     }
+     try:
+         response = requests.get(url, params=params, timeout=10)
+         if response.status_code != 200:
+             return f"Error: Failed to fetch news. Status code: {response.status_code}"
+
+         articles = response.json().get("articles", [])
+         if not articles:
+             return "No articles found."
+
+         # Extract relevant information from each article
+         extracted_articles = []
+         for article in articles:
+             extracted_articles.append({
+                 "title": article.get("title", "No title"),
+                 "url": article.get("url", "#"),
+                 "source": article.get("source", {}).get("name", "Unknown"),
+                 "author": article.get("author", "Unknown"),
+                 "publishedAt": article.get("publishedAt", "Unknown")
+             })
+
+         return extracted_articles
+     except Exception as e:
+         return f"Error fetching news: {str(e)}"
+
+ def fetch_articles_google(topic):
+     """
+     Fetch articles from Google News RSS feed based on the provided topic.
+     """
+     rss_url = f'https://news.google.com/rss/search?q={topic}&hl=en-US&gl=US&ceid=US:en'
+     try:
+         feed = feedparser.parse(rss_url)
+         if not feed.entries:
+             return "No articles found."
+
+         # Extract relevant information from each article
+         extracted_articles = []
+         for entry in feed.entries[:20]:  # Limit to top 20 articles
+             extracted_articles.append({
+                 "title": entry.title,
+                 "url": entry.link,
+                 "source": entry.source.title if hasattr(entry, 'source') else "Unknown",
+                 "author": entry.author if hasattr(entry, 'author') else "Unknown",
+                 "publishedAt": entry.published if hasattr(entry, 'published') else "Unknown"
+             })
+
+         return extracted_articles
+     except Exception as e:
+         return f"Error fetching news: {str(e)}"
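
A brief sketch of how the two fetchers might be called together; it assumes a config module exposing api_key, as the import above implies:

from gather_news import fetch_articles_newsapi, fetch_articles_google

topic = "renewable energy"

for fetch in (fetch_articles_newsapi, fetch_articles_google):
    results = fetch(topic)
    # Each fetcher returns a list of article dicts on success, or an error string on failure
    if isinstance(results, list):
        print(f"{fetch.__name__}: {len(results)} articles")
    else:
        print(f"{fetch.__name__}: {results}")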
input_topic.py ADDED
@@ -0,0 +1,50 @@
+ # input_topic.py
+
+ # Input Design
+ # This script is designed to take user input for a topic or a keyword and validate it before using it in a news summarization application.
+
+ def get_topic():
+     topic = input("Enter a topic to search for news articles: ")
+     if not topic:
+         print("No topic provided. Please enter a valid topic.")
+         return None
+     if len(topic) > 100:  # Arbitrary limit for topic length
+         print("Topic is too long. Please enter a shorter topic.")
+         return None
+     if not topic.isascii():
+         print("Topic contains non-ASCII characters. Please use only ASCII characters.")
+         return None
+     if not topic.isprintable():
+         print("Topic contains non-printable characters. Please use only printable characters.")
+         return None
+     if topic[0].isdigit():
+         print("Topic should not start with a digit. Please enter a valid topic.")
+         return None
+     if topic[0] == ' ':
+         print("Topic should not start with a space. Please enter a valid topic.")
+         return None
+     # Normalize the input to lowercase and strip any leading/trailing whitespace.
+     topic = topic.lower().strip()
+     # Check for special characters and replace them with spaces.
+     special_chars = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '=', '+', '{', '}', '[', ']', '|', ':', ';', '"', "'", '<', '>', ',', '.', '?', '/', '\\']
+     for char in special_chars:
+         topic = topic.replace(char, ' ')
+     # Remove extra spaces
+     topic = ' '.join(topic.split())
+     # Check if the topic is empty after normalization
+     if not topic:
+         print("Topic is empty after normalization. Please enter a valid topic.")
+         return None
+     # Check for common stop words and remove them
+     stop_words = ['the', 'is', 'in', 'and', 'to', 'a', 'of', 'for', 'on', 'with', 'as', 'by', 'this', 'that']
+     topic_words = topic.split()
+     topic = ' '.join([word for word in topic_words if word not in stop_words])
+     # Check if the topic is empty after removing stop words
+     if not topic:
+         print("Topic is empty after removing stop words. Please enter a valid topic.")
+         return None
+
+     return topic
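
A small sketch wiring the validated topic into the gathering step; the module names follow the other files in this commit:

from input_topic import get_topic
from gather_news import fetch_articles_newsapi

topic = get_topic()  # e.g. "The Electric Vehicles!" is normalized to "electric vehicles"
if topic:
    articles = fetch_articles_newsapi(topic)
    if isinstance(articles, list):
        print(f"Found {len(articles)} articles on '{topic}'")
    else:
        print(articles)  # error message or "No articles found."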
summarizer.py ADDED
@@ -0,0 +1,40 @@
+ # summarizer.py
+ # This script summarizes the content of each article for the specified topic using the Hugging Face Transformers library.
+
+ from transformers import pipeline
+
+ # Load summarization pipeline once globally
+ summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")
+
+ # Alternative: load the tokenizer and model explicitly
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ # tokenizer = AutoTokenizer.from_pretrained("flant5-base")
+ # model = AutoModelForSeq2SeqLM.from_pretrained("flant5-base")
+ # summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+
+ # Function to split text into smaller chunks (word-based approximation of the model's token limit)
+ def split_text(text, max_tokens=512):
+     words = text.split()
+     for i in range(0, len(words), max_tokens):
+         yield ' '.join(words[i:i + max_tokens])
+
+ # Function to clean text: collapse whitespace and drop pathologically long "words"
+ def clean_text(text):
+     text = ' '.join(text.split())
+     text = ' '.join(word for word in text.split() if len(word) < 100)
+     return text
+
+ def generate_summary(content):
+     try:
+         if not content.strip():
+             return "No input provided."
+         cleaned_text = clean_text(content)
+         chunks = list(split_text(cleaned_text))
+         # Summarize each chunk and join the pieces so long articles fit within the model's input limit
+         summary = ' '.join(
+             summarizer(chunk, do_sample=False)[0]['summary_text']
+             for chunk in chunks if chunk.strip()
+         )
+         return summary if summary else "No summary could be generated."
+     except Exception as e:
+         return f"Error generating summary: {str(e)}"