Spaces:
Running
Running
import gradio as gr | |
from transformers import pipeline | |
import torch | |
import soundfile as sf | |
import numpy as np | |
from datasets import load_dataset | |
# Build the text-to-speech pipeline once at import time.
# The pipeline is placed on CUDA device 0 when torch reports a GPU, otherwise
# on the CPU (-1). GPU use requires a CUDA-enabled torch install.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

pipe = pipeline(
    task="text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)

# Load the speaker embedding (x-vector) dataset.
# Start from "unavailable" so a failed load leaves a well-defined None that
# the rest of the module can test for.
embeddings_dataset = None
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
except Exception as e:
    # Best-effort: the app still starts without the dataset.
    print(f"Error loading embeddings dataset: {e}")
# Function to get a speaker embedding (a fixed default speaker unless overridden)
def get_speaker_embedding(speaker_index=730):
    """Return a speaker embedding tensor to condition the TTS model.

    Args:
        speaker_index: Row of the module-level ``embeddings_dataset`` whose
            ``"xvector"`` entry is used. Defaults to 730, the row the app has
            always used. NOTE(review): an earlier comment said this row is
            speaker 'p335'; that cannot be confirmed from this file.

    Returns:
        The selected x-vector as a ``torch.Tensor`` with a leading dimension
        added via ``unsqueeze(0)``. If the dataset failed to load, or does not
        contain ``speaker_index``, a random ``(1, 512)`` tensor is returned
        instead.
    """
    # Compare against None explicitly instead of relying on the dataset's
    # truthiness, and bounds-check so a short dataset falls back rather than
    # raising IndexError.
    if embeddings_dataset is not None and 0 <= speaker_index < len(embeddings_dataset):
        xvector = embeddings_dataset[speaker_index]["xvector"]
        return torch.tensor(xvector).unsqueeze(0)

    # Fallback when no usable embedding exists. The tensor is freshly sampled
    # on every call, so the voice is neither stable nor likely to sound good.
    print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
    return torch.randn(1, 512)
def text_to_speech(text):
    """Convert ``text`` to speech with the module-level SpeechT5 pipeline.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            produces no audio.

    Returns:
        A ``(sampling_rate, audio_data)`` tuple built from the pipeline's
        ``'sampling_rate'`` and ``'audio'`` outputs, or ``None`` when there is
        nothing to play (blank input, no speaker embedding, or an error
        during generation).
    """
    # "No audio" is signalled with a bare None. A (None, None) tuple would be
    # treated by the Audio output as real (rate, samples) data and fail when
    # Gradio tries to encode it.
    if not text or not str(text).strip():
        return None

    speaker_embedding = get_speaker_embedding()
    if speaker_embedding is None:
        return None

    try:
        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        # Gradio's Audio component expects a tuple (sampling_rate, audio_data).
        return (output["sampling_rate"], output["audio"])
    except Exception as e:
        # Best-effort: report the failure and clear the player instead of
        # crashing the request.
        print(f"Error during speech generation: {e}")
        return None
# Assemble the Gradio UI: one text box in, one audio player out.
_text_input = gr.Textbox(label="Enter text")
_speech_output = gr.Audio(label="Generated Speech", autoplay=True)

interface = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_speech_output,
    title="Text-to-Speech with SpeechT5",
    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    # debug=True gives more detailed logs.
    # Passing share=True would additionally create a shareable link; handy for
    # demos, but be mindful of security/privacy before enabling it.
    interface.launch(debug=True)