Spaces:

ganjirajesh
/

text-to-speech

Running

App Files Files Community

text-to-speech / app.py

ganjirajesh

Upload 2 files

b44da57 verified 7 days ago

raw

history blame contribute delete

2.84 kB

	import gradio as gr
	from transformers import pipeline
	import torch
	import soundfile as sf
	import numpy as np
	from datasets import load_dataset

	# Build the SpeechT5 text-to-speech pipeline once, at import time.
	# The pipeline is given device 0 when CUDA is available and -1 otherwise
	# (GPU use requires a CUDA-enabled torch build).
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = -1
	pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=device)

	# Fetch the speaker x-vector dataset. Loading is best-effort: on any
	# failure the error is printed and embeddings_dataset stays None so the
	# rest of the app can fall back to a placeholder embedding.
	embeddings_dataset = None
	try:
	    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	except Exception as exc:
	    print(f"Error loading embeddings dataset: {exc}")

	def get_speaker_embedding(speaker_index=730, dataset=None):
	    """Return the speaker embedding used to condition speech generation.

	    Args:
	        speaker_index: Row of the x-vector dataset to use. Defaults to 730,
	            the row this app has always used as its default voice.
	        dataset: Indexable collection of records that each carry an
	            "xvector" entry. Defaults to the module-level
	            ``embeddings_dataset``.

	    Returns:
	        A tensor built from the selected x-vector with a leading batch
	        dimension of 1. When no usable embedding exists (the dataset failed
	        to load, is empty, or ``speaker_index`` is out of range) a random
	        ``(1, 512)`` placeholder is returned and a warning is printed.
	    """
	    if dataset is None:
	        dataset = embeddings_dataset

	    # Bounds-check instead of letting an out-of-range row raise: a missing
	    # speaker degrades to the placeholder voice rather than crashing.
	    if dataset is not None and 0 <= speaker_index < len(dataset):
	        return torch.tensor(dataset[speaker_index]["xvector"]).unsqueeze(0)

	    # Fallback: a random tensor keeps the app running, but the resulting
	    # speech will likely be of poor quality.
	    print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
	    return torch.randn(1, 512)

	def text_to_speech(text):
	    """Synthesize speech for ``text`` using the default speaker.

	    Args:
	        text: The text to speak, as entered in the UI textbox.

	    Returns:
	        A ``(sampling_rate, audio_data)`` tuple taken from the pipeline
	        output, or ``None`` when there is nothing to play: blank input, no
	        speaker embedding, or a failure during generation (which is printed).
	    """
	    # Return None rather than (None, None): the Audio output component
	    # treats None as "no audio", whereas a tuple is unpacked as
	    # (sample_rate, data) and fails when both entries are None.
	    if not text or not text.strip():
	        return None

	    speaker_embedding = get_speaker_embedding()
	    if speaker_embedding is None:
	        return None

	    try:
	        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
	        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
	        return (output["sampling_rate"], output["audio"])
	    except Exception as e:
	        # Best-effort at the UI boundary: log the failure and show no audio
	        # instead of propagating the exception.
	        print(f"Error during speech generation: {e}")
	        return None

	# Wire text_to_speech into a one-textbox, one-audio-player Gradio UI.
	interface = gr.Interface(
	    title="Text-to-Speech with SpeechT5",
	    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
	    fn=text_to_speech,
	    inputs=gr.Textbox(label="Enter text"),
	    outputs=gr.Audio(label="Generated Speech", autoplay=True),
	)

	# Start the app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    # debug=True gives more detailed logs. Passing share=True to launch()
	    # would create a shareable link (handy for demos, but be mindful of
	    # security/privacy).
	    interface.launch(debug=True)