|
import gradio as gr |
|
import requests |
|
from newspaper import Article |
|
from transformers import pipeline |
|
|
import os |
|
import PyPDF2 |
|
|
|
|
|
# Load the fine-tuned FLAN-T5 summarization pipeline once at startup.
summarizer = pipeline("summarization", model="harao-ml/flant5-finetuned-summarize")
|
|
|
|
|
def split_text(text, max_tokens=512): |
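    """Yield chunks of at most max_tokens whitespace-separated words.

    Splits on words rather than model tokens, so this is only an
    approximation of the model's real input limit.
    """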
|
words = text.split() |
|
for i in range(0, len(words), max_tokens): |
|
yield ' '.join(words[i:i + max_tokens]) |
|
|
|
|
|
def clean_text(text): |
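    """Normalize whitespace and drop words longer than 100 characters."""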
|
    # A single pass both collapses whitespace and filters abnormally long
    # tokens (usually URLs or extraction junk).
    return ' '.join(word for word in text.split() if len(word) < 100)
|
|
|
|
|
|
|
def fetch_article_details(url): |
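    """Download and parse an article, returning (title, authors, date, text)."""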
|
try: |
|
article = Article(url) |
|
article.download() |
|
article.parse() |
|
title = article.title or "Untitled" |
|
author = ", ".join(article.authors) if article.authors else "Unknown" |
|
pub_date = article.publish_date.strftime('%B %d, %Y') if article.publish_date else "Unknown" |
|
return title, author, pub_date, article.text |
|
except Exception as e: |
|
return None, None, None, f"Error fetching article: {str(e)}" |
|
|
|
|
|
def generate_summary(content): |
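    """Clean and chunk the input, then summarize chunk-by-chunk."""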
|
if not content.strip(): |
|
return "No input provided." |
|
    cleaned_text = clean_text(content)
    chunks = list(split_text(cleaned_text))
    # Summarize each chunk separately (the model's input window is limited),
    # then join the partial summaries into one consolidated summary.
    summary = ' '.join(
        summarizer(chunk, do_sample=False)[0]['summary_text']
        for chunk in chunks
        if chunk.strip()
    )
    return summary if summary.strip() else "No summary could be generated."
|
|
|
|
|
def summarize_input(mixed_input): |
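    """Summarize either a URL (fetched article) or raw pasted text."""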
|
    text = mixed_input.strip()
    if text.startswith(("http://", "https://")):
        title, author, pub_date, content = fetch_article_details(text)
        if content.startswith("Error"):
            return f"### Error\n\n{content}"
        summary = generate_summary(content)
        return (
            f"**Title:** {title}\n\n**Author(s):** {author}\n\n**Published:** {pub_date}\n\n"
            f"**📝 Summary**\n\n{summary}\n\n[🔗 Read more]({text})\n\n---"
        )
    summary = generate_summary(text)
    return f"## 📝 Summary\n\n{summary}\n\n📄 **Original Text:**\n\n{text}\n\n---"
|
|
|
|
|
def summarize_file(file): |
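    """Extract text from an uploaded .pdf or .txt file and summarize it."""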
|
try: |
|
if file is None: |
|
return "" |
|
|
|
text = "" |
|
if file.name.endswith(".pdf"): |
|
with open(file.name, "rb") as f: |
|
reader = PyPDF2.PdfReader(f) |
|
for page in reader.pages: |
|
text += page.extract_text() or "" |
|
elif file.name.endswith(".txt"): |
|
with open(file.name, "r", encoding="utf-8") as f: |
|
text = f.read() |
|
else: |
|
return "β Unsupported file type." |
|
|
|
if not text.strip(): |
|
return "β No text found in file." |
|
|
|
summary = generate_summary(text) |
|
original_text = text |
|
|
|
|
|
result = ( |
|
f"### π Summary\n\n" |
|
f"{summary}\n\n" |
|
f"---\n\n" |
|
f"π **Original Extracted Text:**\n\n{original_text}" |
|
) |
|
return result |
|
except Exception as e: |
|
return f"β Error processing file: {str(e)}" |
|
|
|
|
|
|
|
|
|
def fetch_news(): |
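    """Fetch top Associated Press headlines from NewsAPI and summarize each."""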
|
url = 'https://newsapi.org/v2/top-headlines' |
|
    api_key = os.environ.get("api_key")
    if not api_key:
        return "### Error\n\nMissing NewsAPI key: set the `api_key` environment variable."
|
params = { |
|
'apiKey': api_key, |
|
'language': 'en', |
|
'sources': 'associated-press', |
|
'pageSize': 10 |
|
} |
|
try: |
|
        # A timeout keeps the UI from hanging if NewsAPI is unreachable.
        response = requests.get(url, params=params, timeout=10)
|
if response.status_code != 200: |
|
return f"Error: Failed to fetch news. Status code: {response.status_code}" |
|
|
|
articles = response.json().get("articles", []) |
|
        header = '## 📰 Top Stories - Instant Insights'
        summaries = []
|
for article in articles: |
|
title = article.get("title", "No title") |
|
article_url = article.get("url", "#") |
|
author = article.get("author", "Unknown") |
|
pub_date = article.get("publishedAt", "Unknown") |
|
content = extract_full_content(article_url) or article.get("content") or article.get("description") or "" |
|
summary = generate_summary(content) |
|
summaries.append(f"**{title}** \n\n**Author(s):** {author}\n\n**Published:** {pub_date}\n\n**π Summary:** {summary}\n\n [π Read more]({article_url})\n\n---") |
|
|
|
        # The header used to live inside `summaries`, so this check could
        # never fire; keeping it separate makes the empty case reachable.
        if not summaries:
            return "### No articles could be summarized."
        return header + "\n\n" + "\n\n".join(summaries)
|
except Exception as e: |
|
return f"### Error fetching news\n\n{str(e)}" |
|
|
|
|
|
def extract_full_content(url): |
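    """Best-effort full-text extraction with newspaper; returns None on failure."""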
|
try: |
|
article = Article(url) |
|
article.download() |
|
article.parse() |
|
return article.text |
|
except Exception: |
|
return None |
|
|
|
|
|
# Build the Gradio UI.
with gr.Blocks(theme=gr.themes.Default(font="Arial", font_mono="Courier New")) as demo:
|
|
|
gr.Markdown("# π° Sum Up! Stay Informed, Instantly") |
|
gr.Markdown("### FLAN-T5-Driven Summarizer for Multi-Format Content") |
|
gr.Markdown("Sum Up! effectively distills lengthy content into clear, concise summaries with just a text input, file upload, or URL. Stay informed with instant access to auto-summarized top news headlinesβall in just one click.") |
|
|
|
|
|
gr.Markdown("---") |
|
with gr.Row(): |
|
|
|
with gr.Column(scale=1, min_width=300): |
|
with gr.Accordion("π’ News at a Glance", open=False): |
|
gr.Markdown("**Source: Associated Press**") |
|
gr.Markdown( |
|
"Click to get today's top news from the Associated Press, simplified and ready to read.") |
|
news_btn = gr.Button("β‘ News Now", variant="primary", elem_id="news-now-btn") |
|
|
|
|
|
with gr.Column(scale=2, min_width=400): |
|
gr.Markdown("### Provide content to summarize") |
|
gr.Markdown("#### Enter Text or URL") |
|
input_box = gr.Textbox( |
|
label="Enter URL or Text", |
|
placeholder="Paste a URL or text here...", |
|
lines=5, |
|
) |
|
summarize_btn = gr.Button("π Summarize", variant="primary", elem_id="summarize-btn") |
|
|
|
|
|
clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn") |
|
|
|
gr.Markdown("#### Upload a File") |
|
file_input = gr.File( |
|
label="Upload a .pdf or .txt file", file_types=[".pdf", ".txt"] |
|
) |
|
gr.Markdown("**Note:** Only PDF and TXT files are supported.") |
|
|
|
|
|
gr.Markdown("---") |
|
gr.Markdown("### π‘ Key Takeaways") |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
gen_output = gr.Markdown() |
|
|
|
|
|
summarize_btn.click(fn=summarize_input, inputs=input_box, outputs=gen_output) |
|
file_input.change(fn=summarize_file, inputs=file_input, outputs=gen_output) |
|
news_btn.click(fn=fetch_news, inputs=[], outputs=gen_output) |
|
|
|
|
|
clear_btn.click( |
|
fn=lambda: ("", None, ""), |
|
inputs=[], |
|
outputs=[input_box, file_input, gen_output], |
|
) |
|
|
|
|
|
gen_output = gr.Markdown(value="") |
|
|
|
|
|
css = """ |
|
#summarize-btn { |
|
background-color: #4CAF50 !important; /* Green for Summarize */ |
|
color: white !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
|
|
#news-now-btn { |
|
background-color: #0078D7 !important; /* Blue for News Now */ |
|
color: white !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
|
|
#clear-btn { |
|
background-color: #d6d8db !important; /* Lighter Gray for Clear */ |
|
color: black !important; |
|
font-size: 16px !important; |
|
padding: 10px 20px !important; |
|
border-radius: 5px !important; |
|
margin-top: 20px !important; |
|
width: 100%; |
|
} |
|
""" |
|
|
|
|
|
# Attach the custom button styles before launching so they take effect.
demo.css = css
|
|
|
if __name__ == "__main__": |
|
demo.launch() |