import gradio as gr
from transformers import pipeline
import torch
import soundfile as sf
import numpy as np
from datasets import load_dataset

# Load the TTS pipeline.
# Use the appropriate device (GPU if available, otherwise CPU).
# You may need to install torch with CUDA support for GPU acceleration.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=device)

# Load the speaker embedding dataset.
try:
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
except Exception as e:
    print(f"Error loading embeddings dataset: {e}")
    embeddings_dataset = None


# Function to get a speaker embedding (a fixed default for now).
def get_speaker_embedding():
    # Use a default speaker embedding from the dataset.
    # You can explore the dataset and change the index below to try other voices.
    if embeddings_dataset:
        speaker_embedding = torch.tensor(embeddings_dataset[730]["xvector"]).unsqueeze(0)
        return speaker_embedding
    else:
        # Fallback if dataset loading failed - this will not sound good.
        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
        # A random tensor as a placeholder - will likely result in poor-quality speech.
        return torch.randn(1, 512)


def text_to_speech(text):
    if not text:
        return None

    speaker_embedding = get_speaker_embedding()

    if speaker_embedding is not None:
        try:
            # Generate speech.
            # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
            output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
            sampling_rate = output["sampling_rate"]
            audio_data = output["audio"]
            # Gradio's Audio component expects a tuple (sampling_rate, audio_data).
            return (sampling_rate, audio_data)
        except Exception as e:
            print(f"Error during speech generation: {e}")
            return None
    else:
        # Return no audio if the speaker embedding could not be loaded.
        return None


# Create the Gradio interface.
interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech", autoplay=True),
    title="Text-to-Speech with SpeechT5",
    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
)

# Launch the interface.
if __name__ == "__main__":
    # Set share=True to create a shareable link (useful for demos, but be mindful of security/privacy).
    # Set debug=True for more detailed logs.
    interface.launch(debug=True)
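
# ---------------------------------------------------------------------------
# Usage note (a sketch; the file name and exact package list below are
# assumptions, not taken from the original script):
#   pip install gradio transformers torch datasets soundfile sentencepiece
#   python app.py
# "sentencepiece" is included because the SpeechT5 text tokenizer depends on
# it; adjust the package list to match your environment.
# ---------------------------------------------------------------------------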