# File size: 2,835 Bytes
# b44da57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import gradio as gr
from transformers import pipeline
import torch
import soundfile as sf
import numpy as np
from datasets import load_dataset

# Build the SpeechT5 text-to-speech pipeline once, at import time.
# The pipeline is given a device index: 0 when CUDA is available (GPU),
# otherwise -1 (CPU). GPU use requires a torch build with CUDA support.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
pipe = pipeline(task="text-to-speech", model="microsoft/speecht5_tts", device=device)

# Fetch the x-vector dataset that supplies speaker embeddings.
# Start from None so that a failed load leaves a clear "unavailable" marker
# for the rest of the module instead of aborting the import.
embeddings_dataset = None
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
except Exception as load_error:
    print(f"Error loading embeddings dataset: {load_error}")

def get_speaker_embedding(speaker_index=730):
    """Return a speaker embedding tensor used to condition the generated voice.

    Args:
        speaker_index: Row of ``embeddings_dataset`` whose ``"xvector"`` entry
            is used. Defaults to 730, the row this script has always used, so
            zero-argument callers behave exactly as before.
            NOTE(review): an earlier comment labelled row 730 as speaker
            'p335'; that label is not verifiable from this file -- confirm
            against the dataset before relying on it.

    Returns:
        A tensor built from the selected x-vector with a leading dimension of
        size 1 added. If the embeddings dataset is unavailable (it failed to
        load, or is empty), a random ``(1, 512)`` tensor is returned instead;
        speech generated from it will likely be of poor quality.

    Any error raised by indexing the dataset (for example an out-of-range
    ``speaker_index``) is propagated to the caller unchanged.
    """
    if not embeddings_dataset:
        # Fallback when the dataset could not be used. A random vector is only
        # a placeholder so the app keeps running; it is not a real voice.
        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
        return torch.randn(1, 512)

    xvector = embeddings_dataset[speaker_index]["xvector"]
    # unsqueeze(0) adds the leading batch dimension.
    return torch.tensor(xvector).unsqueeze(0)

def text_to_speech(text):
    """Convert ``text`` to speech for display in a Gradio Audio component.

    Args:
        text: The text to speak. ``None``, an empty string, or a string made
            only of whitespace yields no audio.

    Returns:
        A ``(sampling_rate, audio_data)`` tuple built from the pipeline's
        ``'sampling_rate'`` and ``'audio'`` outputs on success, or ``None``
        when there is nothing to play (no usable text, no speaker embedding,
        or generation failed).
    """
    # Return a plain None for "no audio": gr.Audio treats None as an empty
    # output, whereas a (None, None) tuple is not a valid audio value.
    if not text or (isinstance(text, str) and not text.strip()):
        return None

    speaker_embedding = get_speaker_embedding()
    if speaker_embedding is None:
        # Defensive guard in case the embedding helper ever signals failure
        # by returning None.
        return None

    try:
        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        # Gradio's Audio component expects a tuple (sampling_rate, audio_data).
        return (output['sampling_rate'], output['audio'])
    except Exception as e:
        # Best-effort: log the failure and show no audio rather than crash
        # the request handler.
        print(f"Error during speech generation: {e}")
        return None

# Assemble the web UI: one textbox feeding text_to_speech, one audio player
# showing its result.
interface = gr.Interface(
    title="Text-to-Speech with SpeechT5",
    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter text"),
    # autoplay is enabled on the output player.
    outputs=gr.Audio(autoplay=True, label="Generated Speech"),
)

# Launch the interface only when this file is executed as a script; when the
# module is imported, `interface` is still built but launch() is not called.
if __name__ == "__main__":
    # share=True (not set here) would create a shareable link -- useful for
    # demos, but be mindful of security/privacy.
    # debug=True is passed to get more detailed logs.
    interface.launch(debug=True)