Spaces:

ganjirajesh
/

text-to-speech

Running

App Files Files Community

ganjirajesh committed 9 days ago

Commit

b44da57

verified ·

1 Parent(s): 55bd50c

Upload 2 files

Browse files

initial commit

Files changed (2) hide show

app.py +71 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import gradio as gr
+from transformers import pipeline
+import torch
+import soundfile as sf
+import numpy as np
+from datasets import load_dataset
+# Build the SpeechT5 text-to-speech pipeline once, at import time.
+# `device` is 0 (first CUDA GPU) when CUDA is available and -1 (CPU) otherwise;
+# GPU acceleration needs a torch build with CUDA support.
+if torch.cuda.is_available():
+    device = 0
+else:
+    device = -1
+pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=device)
+# Load the speaker-embedding dataset. A failure here is reported but not fatal:
+# `embeddings_dataset` stays None and get_speaker_embedding() uses its fallback.
+embeddings_dataset = None
+try:
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+except Exception as e:
+    print(f"Error loading embeddings dataset: {e}")
+# Speaker selection: a single fixed default voice is used for every request.
+def get_speaker_embedding():
+    """Return the speaker embedding tensor passed to the TTS pipeline.
+
+    Reads the module-level ``embeddings_dataset``. When it is falsy (for
+    example ``None`` because loading failed), a warning is printed and a
+    freshly drawn random tensor of shape ``(1, 512)`` is returned instead.
+    Otherwise the ``"xvector"`` field of row 730 is converted to a tensor and
+    given a leading dimension via ``unsqueeze(0)``.
+    """
+    if not embeddings_dataset:
+        # Placeholder only: a random vector is not a real voice, so the
+        # synthesized speech will likely sound poor.
+        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
+        return torch.randn(1, 512)
+    # Row 730 is the hard-coded default voice; other rows of the dataset hold
+    # other speaker embeddings.
+    # NOTE(review): the earlier comment labelled this row 'p335' - unverified.
+    xvector = embeddings_dataset[730]["xvector"]
+    return torch.tensor(xvector).unsqueeze(0)
+def text_to_speech(text):
+    """Convert ``text`` to speech with the module-level SpeechT5 pipeline.
+
+    Args:
+        text: The text to synthesize. ``None``, empty, and whitespace-only
+            values are treated as "nothing to say".
+
+    Returns:
+        A ``(sampling_rate, audio_data)`` tuple, the form the Gradio Audio
+        output accepts, or ``None`` when there is no input or synthesis
+        failed. ``None`` (rather than a ``(None, None)`` tuple) is what
+        leaves the Audio component empty; a tuple is always interpreted as
+        real audio and would fail on the ``None`` sample data.
+    """
+    # Guard clause: skip the model entirely when there is nothing to speak.
+    if not text or not text.strip():
+        return None
+    speaker_embedding = get_speaker_embedding()
+    try:
+        # The pipeline returns a dict with 'audio' and 'sampling_rate'.
+        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
+        return (output["sampling_rate"], output["audio"])
+    except Exception as e:
+        # Best effort: log the failure and show no audio instead of crashing the UI.
+        print(f"Error during speech generation: {e}")
+        return None
+# Gradio UI: one text box in, one auto-playing audio player out, wired to
+# text_to_speech.
+interface = gr.Interface(
+    title="Text-to-Speech with SpeechT5",
+    description="Enter text to convert it to speech using the microsoft/speecht5_tts model. A default speaker is used.",
+    fn=text_to_speech,
+    inputs=gr.Textbox(label="Enter text"),
+    outputs=gr.Audio(label="Generated Speech", autoplay=True),
+)
+# Start the app only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+    # debug=True gives more detailed logs. Passing share=True would create a
+    # shareable link (handy for demos, but mind security/privacy).
+    interface.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+huggingface_hub
+gradio
+transformers
+torch
+soundfile
+datasets