|
import gradio as gr |
|
import requests |
|
from newspaper import Article |
|
from transformers import pipeline |
|
|
import os |
|
import PyPDF2 |
|
|
|
|
|
# Load the fine-tuned FLAN-T5 summarization pipeline once at startup.
summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")
|
|
|
|
|
def split_text(text, max_tokens=512): |
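    """Yield chunks of at most max_tokens whitespace-separated words.

    Splits on words rather than model tokens, so this is only an
    approximation of the model's real input limit.
    """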
|
words = text.split() |
|
for i in range(0, len(words), max_tokens): |
|
yield ' '.join(words[i:i + max_tokens]) |
|
|
|
|
|
def clean_text(text): |
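    """Normalize whitespace and drop words longer than 100 characters."""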
|
    # A single pass both collapses whitespace and filters abnormally long
    # tokens (usually URLs or extraction junk).
    return ' '.join(word for word in text.split() if len(word) < 100)
|
|
|
|
|
|
|
def fetch_article_details(url): |
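    """Download and parse an article, returning (title, authors, date, text)."""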
|
try: |
|
article = Article(url) |
|
article.download() |
|
article.parse() |
|
title = article.title or "Untitled" |
|
author = ", ".join(article.authors) if article.authors else "Unknown" |
|
pub_date = article.publish_date.strftime('%B %d, %Y') if article.publish_date else "Unknown" |
|
return title, author, pub_date, article.text |
|
except Exception as e: |
|
return None, None, None, f"Error fetching article: {str(e)}" |
|
|
|
|
|
def generate_summary(content): |
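    """Clean and chunk the input, then summarize chunk-by-chunk."""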
|
if not content.strip(): |
|
return "No input provided." |
|
    cleaned_text = clean_text(content)
    chunks = list(split_text(cleaned_text))
    # Summarize each chunk separately (the model's input window is limited),
    # then join the partial summaries into one consolidated summary.
    summary = ' '.join(
        summarizer(chunk, do_sample=False)[0]['summary_text']
        for chunk in chunks
        if chunk.strip()
    )
    return summary if summary.strip() else "No summary could be generated."
|
|
|
|
|
def summarize_input(mixed_input): |
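    """Summarize either a URL (fetched article) or raw pasted text."""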
|
    text = mixed_input.strip()
    if text.startswith(("http://", "https://")):
        title, author, pub_date, content = fetch_article_details(text)
        if content.startswith("Error"):
            return f"### Error\n\n{content}"
        summary = generate_summary(content)
        return (
            f"**Title:** {title}\n\n**Author(s):** {author}\n\n**Published:** {pub_date}\n\n"
            f"**📝 Summary**\n\n{summary}\n\n[🔗 Read more]({text})\n\n---"
        )
    summary = generate_summary(text)
    return f"## 📝 Summary\n\n{summary}\n\n📄 **Original Text:**\n\n{text}\n\n---"
|
|
|
|
|
def summarize_file(file): |
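    """Extract text from an uploaded .pdf or .txt file and summarize it."""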
|
try: |
|
if file is None: |
|
return "" |
|
|
|
text = "" |
|
if file.name.endswith(".pdf"): |
|
with open(file.name, "rb") as f: |
|
reader = PyPDF2.PdfReader(f) |
|
for page in reader.pages: |
|
text += page.extract_text() or "" |
|
elif file.name.endswith(".txt"): |
|
with open(file.name, "r", encoding="utf-8") as f: |
|
text = f.read() |
|
else: |
|
return "β Unsupported file type." |
|
|
|
if not text.strip(): |
|
return "β No text found in file." |
|
|
|
summary = generate_summary(text) |
|
original_text = text |
|
|
|
|
|
result = ( |
|
f"### π Summary\n\n" |
|
f"{summary}\n\n" |
|
f"---\n\n" |
|
f"π **Original Extracted Text:**\n\n{original_text}" |
|
) |
|
return result |
|
except Exception as e: |
|
return f"β Error processing file: {str(e)}" |
|
|
|
|
|
|
|
|
|
def fetch_news(): |
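    """Fetch top Associated Press headlines from NewsAPI and summarize each."""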
|
url = 'https://newsapi.org/v2/top-headlines' |
|
    api_key = os.environ.get("api_key")
    if not api_key:
        return "### Error\n\nMissing NewsAPI key: set the `api_key` environment variable."
|
params = { |
|
'apiKey': api_key, |
|
'language': 'en', |
|
'sources': 'associated-press', |
|
'pageSize': 10 |
|
} |
|
try: |
|
        # A timeout keeps the UI from hanging if NewsAPI is unreachable.
        response = requests.get(url, params=params, timeout=10)
|
if response.status_code != 200: |
|
return f"Error: Failed to fetch news. Status code: {response.status_code}" |
|
|
|
articles = response.json().get("articles", []) |
|
        header = '## 📰 Top Stories - Instant Insights'
        summaries = []
|
for article in articles: |
|
title = article.get("title", "No title") |
|
article_url = article.get("url", "#") |
|
author = article.get("author", "Unknown") |
|
pub_date = article.get("publishedAt", "Unknown") |
|
content = extract_full_content(article_url) or article.get("content") or article.get("description") or "" |
|
summary = generate_summary(content) |
|
summaries.append(f"**{title}** \n\n**Author(s):** {author}\n\n**Published:** {pub_date}\n\n**π Summary:** {summary}\n\n [π Read more]({article_url})\n\n---") |
|
|
|
        # The header used to live inside `summaries`, so this check could
        # never fire; keeping it separate makes the empty case reachable.
        if not summaries:
            return "### No articles could be summarized."
        return header + "\n\n" + "\n\n".join(summaries)
|
except Exception as e: |
|
return f"### Error fetching news\n\n{str(e)}" |
|
|
|
|
|
def extract_full_content(url): |
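    """Best-effort full-text extraction with newspaper; returns None on failure."""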
|
try: |
|
article = Article(url) |
|
article.download() |
|
article.parse() |
|
return article.text |
|
except Exception: |
|
return None |
|
|
|
|
|
# Build the Gradio UI.
with gr.Blocks(theme=gr.themes.Default(font="Arial", font_mono="Courier New")) as demo:
|
|
|
gr.Markdown("# π° Sum Up! Stay Informed, Instantly") |
|
gr.Markdown("### FLAN-T5-Driven Summarizer for Multi-Format Content") |
|
gr.Markdown("Sum Up! effectively distills lengthy content into clear, concise summaries with just a text input, file upload, or URL. Stay informed with instant access to auto-summarized top news headlinesβall in just one click.") |
|
|
|
|
|
gr.Markdown("---") |
|
with gr.Row(): |
|
|
|
with gr.Column(scale=1, min_width=300): |
|
with gr.Accordion("π’ News at a Glance", open=False): |
|
gr.Markdown("**Source: Associated Press**") |
|
gr.Markdown( |
|
"Click to get today's top news from the Associated Press, simplified and ready to read.") |
|
news_btn = gr.Button("β‘ News Now", variant="primary", elem_id="news-now-btn") |
|
|
|
|
|
with gr.Column(scale=2, min_width=400): |
|
gr.Markdown("### Provide content to summarize") |
|
gr.Markdown("#### Enter Text or URL") |
|
input_box = gr.Textbox( |
|
label="Enter URL or Text", |
|
placeholder="Paste a URL or text here...", |
|
lines=5, |
|
) |
|
summarize_btn = gr.Button("π Summarize", variant="primary", elem_id="summarize-btn") |
|
|
|
|
|
clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn") |
|
|
|
gr.Markdown("#### Upload a File") |
|
file_input = gr.File( |
|
label="Upload a .pdf or .txt file", file_types=[".pdf", ".txt"] |
|
) |
|
gr.Markdown("**Note:** Only PDF and TXT files are supported.") |
|
|
|
|
|
gr.Markdown("---") |
|
gr.Markdown("### π‘ Key Takeaways") |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
gen_output = gr.Markdown() |
|
|
|
|
|
summarize_btn.click(fn=summarize_input, inputs=input_box, outputs=gen_output) |
|
file_input.change(fn=summarize_file, inputs=file_input, outputs=gen_output) |
|
news_btn.click(fn=fetch_news, inputs=[], outputs=gen_output) |
|
|
|
|
|
clear_btn.click( |
|
fn=lambda: ("", None, ""), |
|
inputs=[], |
|
outputs=[input_box, file_input, gen_output], |
|
) |
|
|
|
|
|
gen_output = gr.Markdown(value="") |
|
|
|
|
|
css = """ |
|
#summarize-btn { |
|
background-color: #4CAF50 !important; /* Green for Summarize */ |
|
color: white !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
|
|
#news-now-btn { |
|
background-color: #0078D7 !important; /* Blue for News Now */ |
|
color: white !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
|
|
#clear-btn { |
|
background-color: #d6d8db !important; /* Lighter Gray for Clear */ |
|
color: black !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
""" |
|
|
|
|
|
# Attach the custom button styles before launching so they take effect.
demo.css = css
|
|
|
if __name__ == "__main__": |
|
demo.launch() |