ganjirajesh committed on
Commit
b44da57
·
verified ·
1 Parent(s): 55bd50c

Upload 2 files

Browse files

initial commit

Files changed (2) hide show
  1. app.py +71 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+ import soundfile as sf
5
+ import numpy as np
6
+ from datasets import load_dataset
7
+
8
# Text-to-speech pipeline.
# torch picks the device: CUDA device 0 when a GPU is visible, otherwise -1
# (the non-GPU setting). GPU acceleration may require a CUDA-enabled torch build.
device = -1 if not torch.cuda.is_available() else 0
pipe = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)

# Speaker x-vector dataset.
# A failed load is reported on stdout and leaves embeddings_dataset as None
# instead of aborting start-up; downstream code checks for that.
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
except Exception as e:
    print(f"Error loading embeddings dataset: {e}")
    embeddings_dataset = None
20
+
21
# Row of the x-vector dataset used as the default voice.
# NOTE(review): the original comment labelled row 730 as speaker 'p335', which
# does not look like a CMU ARCTIC speaker id -- confirm which voice this is.
_DEFAULT_SPEAKER_INDEX = 730

# Width of the random placeholder embedding used when no dataset is available.
_PLACEHOLDER_DIM = 512


def get_speaker_embedding(speaker_index=_DEFAULT_SPEAKER_INDEX, dataset=None):
    """Return a speaker embedding tensor with a leading batch dimension.

    Args:
        speaker_index: Row of the dataset whose "xvector" entry is used.
            Defaults to the row the app has always used (730).
        dataset: Indexable collection of rows exposing an "xvector" entry.
            When omitted, the module-level ``embeddings_dataset`` is used.

    Returns:
        ``torch.tensor(row["xvector"]).unsqueeze(0)`` for the selected row, or
        a random ``(1, 512)`` tensor when no embeddings are available (speech
        generated from the placeholder will likely be of poor quality).

    Raises:
        Whatever the dataset raises for an out-of-range ``speaker_index``
        (``IndexError`` for ordinary sequences).
    """
    if dataset is None:
        # Fall back to the dataset loaded at import time (None if loading failed).
        dataset = embeddings_dataset

    if dataset is None or len(dataset) == 0:
        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
        return torch.randn(1, _PLACEHOLDER_DIM)

    # unsqueeze(0) adds the batch axis in front of the embedding vector.
    return torch.tensor(dataset[speaker_index]["xvector"]).unsqueeze(0)
34
+
35
def text_to_speech(text):
    """Synthesize speech for ``text`` with the default speaker.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input
            produces no audio.

    Returns:
        A ``(sampling_rate, audio_data)`` tuple taken from the pipeline output,
        which is the form Gradio's Audio component expects, or ``None`` when
        there is nothing to play (blank input or a generation failure).
        ``None`` is returned rather than ``(None, None)`` because the latter
        is a tuple and would be treated as audio data to render.
    """
    if not text or not text.strip():
        return None

    speaker_embedding = get_speaker_embedding()

    try:
        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        return (output["sampling_rate"], output["audio"])
    except Exception as e:
        # Best effort: report the failure and show no audio instead of crashing the UI.
        print(f"Error during speech generation: {e}")
        return None
57
+
58
# Gradio UI: a single text box feeds text_to_speech, and the result is shown
# in an audio player that starts playing automatically.
interface = gr.Interface(
    title="Text-to-Speech with SpeechT5",
    description=(
        "Enter text to convert it to speech using the microsoft/speecht5_tts model. "
        "A default speaker is used."
    ),
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech", autoplay=True),
)

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    # debug=True gives more detailed logs.
    # Passing share=True would create a shareable link (handy for demos, but
    # be mindful of security/privacy).
    interface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ gradio
3
+ transformers
4
+ torch
5
+ soundfile
6
+ datasets