Spaces:

diginoron
/

TTS-EN

Sleeping

App Files Files Community

TTS-EN / app.py

diginoron

Update app.py

cfeeb7f verified 29 days ago

raw

history blame

3.66 kB

	import gradio as gr
	from gtts import gTTS
	import pdfplumber
	from sumy.parsers.plaintext import PlaintextParser
	from sumy.nlp.tokenizers import Tokenizer
	from sumy.summarizers.lsa import LsaSummarizer
	import nltk
	import os

	# Download the NLTK sentence-tokenizer data that sumy's Tokenizer("english") relies on.
	# Older NLTK releases read the pickled "punkt" models; newer releases look the
	# tokenizer up under "punkt_tab" instead, so fetch both to work with either.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	def extract_text_from_pdf(pdf_file):
	    """
	    Extract the text of every page of a PDF using pdfplumber.

	    Failures are reported in-band rather than raised: any problem yields a
	    string starting with "Error extracting text:", which callers can test for.

	    Args:
	        pdf_file: Uploaded PDF file (passed straight to ``pdfplumber.open``).

	    Returns:
	        str: The page texts joined by newlines, the placeholder
	        "No text could be extracted from the PDF." when no page yields text,
	        or an "Error extracting text: ..." message on failure.
	    """
	    # Nothing was supplied (e.g. the form was submitted without an upload):
	    # report that clearly instead of surfacing an opaque library error.
	    if pdf_file is None:
	        return "Error extracting text: no PDF file was provided."

	    try:
	        with pdfplumber.open(pdf_file) as pdf:
	            # `or ""` guards against a page whose extraction result is falsy.
	            page_texts = [page.extract_text() or "" for page in pdf.pages]
	    except Exception as e:
	        return f"Error extracting text: {str(e)}"

	    # Join with a newline so the last word of one page is not fused with the
	    # first word of the next, which would corrupt sentence splitting later on.
	    text = "\n".join(part for part in page_texts if part)
	    return text if text else "No text could be extracted from the PDF."

	def summarize_text(text, sentences_count=12, sentences_per_paragraph=3):
	    """
	    Summarize text with sumy's LSA summarizer and lay it out in paragraphs.

	    With the defaults (12 sentences, 3 per paragraph) the result is roughly
	    four paragraphs. Failures are reported in-band rather than raised.

	    Args:
	        text (str): Text to summarize.
	        sentences_count (int): Number of sentences requested from the summarizer.
	        sentences_per_paragraph (int): How many summary sentences are grouped
	            into each paragraph before a blank line is inserted.

	    Returns:
	        str: Summary sentences separated by single spaces, with paragraphs
	        separated by a blank line; "No summary generated." when there is
	        nothing to summarize; or an "Error summarizing text: ..." message
	        if anything raises.
	    """
	    try:
	        # Blank input cannot produce a summary; skip the tokenizer entirely.
	        if not text.strip():
	            return "No summary generated."

	        # Parse the raw text into a sumy document using English tokenization rules.
	        parser = PlaintextParser.from_string(text, Tokenizer("english"))
	        summarizer = LsaSummarizer()

	        # The summarizer yields sentence objects; str() gives their text.
	        sentences = [
	            str(sentence)
	            for sentence in summarizer(parser.document, sentences_count)
	        ]

	        # Group consecutive sentences into paragraphs. Joining (instead of
	        # appending "sentence + space") avoids trailing spaces before each
	        # paragraph break.
	        paragraphs = [
	            " ".join(sentences[start:start + sentences_per_paragraph])
	            for start in range(0, len(sentences), sentences_per_paragraph)
	        ]
	        summary_text = "\n\n".join(paragraphs).strip()

	        return summary_text if summary_text else "No summary generated."
	    except Exception as e:
	        return f"Error summarizing text: {str(e)}"

	def pdf_to_speech(pdf_file, lang="en"):
	    """
	    Convert a PDF into a spoken summary using gTTS.

	    The PDF text is extracted, summarized to about 12 sentences, and the
	    summary is synthesized to an MP3 file.

	    Args:
	        pdf_file: Uploaded PDF file.
	        lang (str): gTTS language code for the voice (default is 'en').

	    Returns:
	        tuple: (path to the generated audio file, summarized text) on success,
	        or (None, error message) on failure. The audio slot is None on
	        failure so an error message is never interpreted as a file path.
	    """
	    try:
	        # Extract text from the PDF. The helper reports failures as a string
	        # with a fixed prefix; match that prefix exactly, because a plain
	        # `"Error" in text` check would also reject any PDF that merely
	        # contains the word "Error".
	        text = extract_text_from_pdf(pdf_file)
	        if text.startswith("Error extracting text:"):
	            return None, text

	        # Summarize text (approx. 12 sentences for 4 paragraphs); same
	        # prefix-based failure convention as above.
	        summarized_text = summarize_text(text, sentences_count=12)
	        if summarized_text.startswith("Error summarizing text:"):
	            return None, summarized_text

	        # Synthesize the summary at normal speaking speed.
	        tts = gTTS(text=summarized_text, lang=lang, slow=False)

	        # NOTE(review): the output name is fixed, so every request writes the
	        # same file - confirm this is acceptable for concurrent users.
	        output_file = "output.mp3"
	        tts.save(output_file)

	        return output_file, summarized_text

	    except Exception as e:
	        # Anything unexpected (network failure in gTTS, unsupported language,
	        # ...) is reported as text, with no audio.
	        return None, f"An error occurred: {str(e)}"

	# Gradio UI: a PDF upload and a language selector are passed to pdf_to_speech,
	# whose two return values populate the audio player and the summary text box.
	demo = gr.Interface(
	    title="PDF Summary to Speech",
	    description="Upload an English PDF file, select a language, and generate speech from a summarized version (approx. 4 paragraphs). The summarized text is also displayed.",
	    fn=pdf_to_speech,
	    inputs=[
	        gr.File(label="Upload a PDF file", file_types=[".pdf"]),
	        gr.Dropdown(choices=["en", "es", "fr"], label="Select Language", value="en"),
	    ],
	    outputs=[
	        gr.Audio(label="Generated Speech"),
	        gr.Textbox(label="Summarized Text"),
	    ],
	)

	# Start the app only when this file is executed directly, not when imported.
	if __name__ == "__main__":
	    demo.launch()