# text-to-speech / app.py
# ganjirajesh's picture
# Upload 2 files
# b44da57 verified
import gradio as gr
from transformers import pipeline
import torch
import soundfile as sf
import numpy as np
from datasets import load_dataset
# Choose where the pipeline runs: GPU index 0 when PyTorch reports CUDA,
# otherwise -1 (CPU). GPU acceleration needs a CUDA-enabled torch install.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the SpeechT5 text-to-speech pipeline once, at import time, so every
# request handled by the app reuses the same loaded model.
pipe = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)
# Speaker x-vector dataset used to pick a voice for the synthesized speech.
# Start from None so that a failed load leaves the "unavailable" marker in
# place; get_speaker_embedding() falls back to a placeholder in that case.
embeddings_dataset = None
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )
except Exception as e:
    # Best effort: report the problem and keep the app importable.
    print(f"Error loading embeddings dataset: {e}")
def get_speaker_embedding():
    """Return the speaker embedding used for every synthesis request.

    Reads the module-level ``embeddings_dataset``.

    Returns:
        A tensor built from the ``"xvector"`` field of dataset row 730 with a
        leading axis added via ``unsqueeze(0)``. When the dataset is
        unavailable (or empty), a random ``(1, 512)`` tensor is returned
        instead, which will likely produce poor-quality speech.
    """
    # Fallback first: no usable dataset means no real speaker to choose from.
    if not embeddings_dataset:
        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
        # Random placeholder; a fresh one is drawn on every call.
        return torch.randn(1, 512)

    # Row 730 is the default voice picked by the original author (noted there
    # as speaker 'p335' -- NOTE(review): unverified against the dataset).
    # Other rows can be tried to get a different speaker.
    xvector = embeddings_dataset[730]["xvector"]
    return torch.tensor(xvector).unsqueeze(0)
def text_to_speech(text):
    """Convert ``text`` to speech using the module-level SpeechT5 pipeline.

    Args:
        text: The text entered in the Gradio textbox. May be ``None`` or
            empty when the user submits without typing anything.

    Returns:
        A ``(sampling_rate, audio_data)`` tuple built from the pipeline's
        ``'sampling_rate'`` and ``'audio'`` outputs, or ``None`` when there
        is nothing to play (blank input or a failed generation).
    """
    # Nothing to synthesize for missing, empty or whitespace-only input.
    # Return a bare None rather than (None, None): gr.Audio interprets None
    # as "no audio", whereas a tuple is treated as real (rate, data) output.
    if not text or not text.strip():
        return None

    # get_speaker_embedding() always yields a tensor (real or placeholder),
    # so no separate "missing embedding" branch is needed here.
    speaker_embedding = get_speaker_embedding()

    try:
        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        sampling_rate = output["sampling_rate"]
        audio_data = output["audio"]
    except Exception as e:
        # Best effort: log the failure and show no audio instead of crashing
        # the request handler.
        print(f"Error during speech generation: {e}")
        return None

    # Gradio's Audio component expects a tuple (sampling_rate, audio_data).
    return (sampling_rate, audio_data)
# Assemble the Gradio UI: one textbox in, one auto-playing audio player out,
# wired to text_to_speech().
_text_input = gr.Textbox(label="Enter text")
_speech_output = gr.Audio(label="Generated Speech", autoplay=True)

interface = gr.Interface(
    fn=text_to_speech,
    inputs=_text_input,
    outputs=_speech_output,
    title="Text-to-Speech with SpeechT5",
    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
)
def main():
    """Start the Gradio server for ``interface``."""
    # debug=True gives more detailed logs. Passing share=True would create a
    # shareable link (useful for demos, but be mindful of security/privacy).
    interface.launch(debug=True)


# Only launch when executed as a script, not when imported.
if __name__ == "__main__":
    main()