Spaces:
Running
Running
Upload 2 files
Browse files: initial commit
- app.py +71 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import pipeline
|
3 |
+
import torch
|
4 |
+
import soundfile as sf
|
5 |
+
import numpy as np
|
6 |
+
from datasets import load_dataset
|
7 |
+
|
8 |
+
# Load the TTS pipeline
|
9 |
+
# Use the appropriate device (CPU or GPU)
|
10 |
+
# You might need to install torch with CUDA support for GPU acceleration
|
11 |
+
# Pick the device index passed to the pipeline: CUDA device 0 when a GPU
# is visible to torch, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the SpeechT5 text-to-speech pipeline once at import time so every
# request reuses the same loaded model.
pipe = pipeline(
    "text-to-speech",
    model="microsoft/speecht5_tts",
    device=device,
)
|
13 |
+
|
14 |
+
# Load the speaker embedding dataset
|
15 |
+
# Default to "no dataset"; get_speaker_embedding() falls back to a
# placeholder embedding when this stays None.
embeddings_dataset = None
try:
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors",
        split="validation",
    )
except Exception as e:
    # Best effort: the app still starts without the dataset, so report the
    # failure and carry on with embeddings_dataset left as None.
    print(f"Error loading embeddings dataset: {e}")
|
20 |
+
|
21 |
+
# Function to get a speaker embedding (using a default for now)
|
22 |
+
def get_speaker_embedding(speaker_index=730):
    """Return the speaker embedding used to condition speech generation.

    Args:
        speaker_index: Row of the module-level ``embeddings_dataset`` whose
            ``"xvector"`` entry is used. Defaults to 730, the row this app
            has always used, so existing zero-argument callers are
            unaffected.

    Returns:
        A tensor built from the selected row's ``"xvector"`` with a leading
        dimension of size 1 added, or a random ``(1, 512)`` tensor when the
        dataset failed to load or is empty.
    """
    # Explicit None/empty check; the original relied on the dataset's
    # truthiness, which is False in both of these cases.
    if embeddings_dataset is None or len(embeddings_dataset) == 0:
        # Fallback if dataset loading failed - a random tensor will likely
        # result in poor quality speech, and differs on every call.
        print("Warning: Using a placeholder speaker embedding. TTS quality may be poor.")
        return torch.randn(1, 512)

    # NOTE(review): the original comment described row 730 as speaker 'p335';
    # that label could not be confirmed from this file - verify against the
    # dataset before relying on it.
    xvector = embeddings_dataset[speaker_index]["xvector"]
    return torch.tensor(xvector).unsqueeze(0)
|
34 |
+
|
35 |
+
def text_to_speech(text):
    """Synthesize speech for ``text`` with the module-level TTS pipeline.

    Args:
        text: The text to speak. ``None``, an empty string, or a string
            containing only whitespace produces no audio.

    Returns:
        A ``(sampling_rate, audio_data)`` tuple taken from the pipeline
        output, or ``None`` when there is nothing to play (blank input, no
        speaker embedding, or an error during generation).
    """
    # Return a bare None, not (None, None): this function feeds a single
    # gr.Audio output, which treats any tuple as (sampling_rate, audio_data),
    # so a tuple of Nones is not a valid "no audio" value.
    if not text or not text.strip():
        return None

    speaker_embedding = get_speaker_embedding()
    if speaker_embedding is None:
        # Speaker embedding could not be loaded; nothing to generate.
        return None

    try:
        # The pipeline returns a dictionary with 'audio' and 'sampling_rate'.
        output = pipe(text, forward_params={"speaker_embeddings": speaker_embedding})
        sampling_rate = output['sampling_rate']
        audio_data = output['audio']
    except Exception as e:
        # Best effort: log the failure and show no audio instead of crashing
        # the request.
        print(f"Error during speech generation: {e}")
        return None

    # Gradio's Audio component expects a tuple (sampling_rate, audio_data).
    return (sampling_rate, audio_data)
|
57 |
+
|
58 |
+
# Create the Gradio interface
|
59 |
+
# One text box in, one autoplaying audio player out, wired to text_to_speech.
interface = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech", autoplay=True),
    title="Text-to-Speech with SpeechT5",
    description=(
        "Enter text to convert it to speech using the microsoft/speecht5_tts model. "
        "A default speaker is used."
    ),
)
|
66 |
+
|
67 |
+
# Launch the interface
|
68 |
+
if __name__ == "__main__":
    # Only start the server when run as a script, not when imported.
    # debug=True gives more detailed logs; pass share=True as well to create
    # a shareable link (useful for demos, but be mindful of security/privacy).
    interface.launch(
        debug=True,
    )
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub
|
2 |
+
gradio
|
3 |
+
transformers
|
4 |
+
torch
|
5 |
+
soundfile
|
6 |
+
datasets
|