emirhanbilgic committed · verified
Commit 2301825 · 1 Parent(s): 1610722

Update app.py

Files changed (1)
  1. app.py +16 -10
app.py CHANGED
@@ -82,33 +82,39 @@ def normalize_text(text):
     return text
 
 @spaces.GPU(duration = 60)
-def text_to_speech(text, audio_file):
+def text_to_speech(text, audio_file=None):
     normalized_text = normalize_text(text)
     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
 
-    waveform, sample_rate = sf.read(audio_file)
-    if len(waveform.shape) > 1:
-        waveform = waveform[:, 0]  # Take the first channel if stereo
-    if sample_rate != 16000:
-        print("Warning: The model expects 16kHz sampling rate")
-    speaker_embeddings = create_speaker_embedding(waveform)
+    if audio_file is not None:
+        waveform, sample_rate = sf.read(audio_file)
+        if len(waveform.shape) > 1:
+            waveform = waveform[:, 0]  # Take the first channel if stereo
+        if sample_rate != 16000:
+            print("Warning: The model expects 16kHz sampling rate")
+        speaker_embeddings = create_speaker_embedding(waveform)
+    else:
+        # Use a default speaker embedding when no audio file is provided
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
     return "output.wav", normalized_text
 
+# Update the Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
-        gr.Audio(label="Upload a short audio file of the target speaker", type="filepath")
+        gr.Audio(label="Upload a short audio file of the target speaker (optional)", type="filepath")
     ],
     outputs=[
         gr.Audio(label="Generated Speech"),
         gr.Textbox(label="Normalized Text")
     ],
     title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker",
-    description="Enter Turkish text, upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
+    description="Enter Turkish text, optionally upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
 )
 
-iface.launch()
+iface.launch(share=True)
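
The commit makes the reference-speaker upload optional: when no audio file is provided, the app falls back to a stock x-vector (index 7306 of Matthijs/cmu-arctic-xvectors) instead of calling create_speaker_embedding on an uploaded clip. Below is a minimal, self-contained sketch of that fallback path. It assumes the standard transformers SpeechT5 API; the base "microsoft/speecht5_tts" checkpoint and "microsoft/speecht5_hifigan" vocoder are stand-ins, since the Space's fine-tuned Turkish checkpoint is loaded elsewhere in app.py and is not shown in this diff.

```python
# Sketch of the default-speaker fallback introduced in this commit.
# Assumption: base SpeechT5 checkpoints stand in for the Space's fine-tuned Turkish model.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Default speaker embedding: entry 7306 of the CMU ARCTIC x-vector dataset,
# the same index the commit hard-codes for the no-upload case.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

# Generate speech exactly as text_to_speech does once an embedding is chosen.
inputs = processor(text="Merhaba, bu bir deneme.", return_tensors="pt").to(device)
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
```

With this fallback in place the demo produces audio even when the user skips the upload, while a provided reference clip still routes through create_speaker_embedding for voice cloning.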