emirhanbilgic committed · verified
Commit 2301825 · 1 Parent(s): 1610722

Update app.py

Files changed (1)
  1. app.py +16 -10
app.py CHANGED
@@ -82,33 +82,39 @@ def normalize_text(text):
     return text
 
 @spaces.GPU(duration = 60)
-def text_to_speech(text, audio_file):
+def text_to_speech(text, audio_file=None):
     normalized_text = normalize_text(text)
     inputs = processor(text=normalized_text, return_tensors="pt").to(device)
 
-    waveform, sample_rate = sf.read(audio_file)
-    if len(waveform.shape) > 1:
-        waveform = waveform[:, 0]  # Take the first channel if stereo
-    if sample_rate != 16000:
-        print("Warning: The model expects 16kHz sampling rate")
-    speaker_embeddings = create_speaker_embedding(waveform)
+    if audio_file is not None:
+        waveform, sample_rate = sf.read(audio_file)
+        if len(waveform.shape) > 1:
+            waveform = waveform[:, 0]  # Take the first channel if stereo
+        if sample_rate != 16000:
+            print("Warning: The model expects 16kHz sampling rate")
+        speaker_embeddings = create_speaker_embedding(waveform)
+    else:
+        # Use a default speaker embedding when no audio file is provided
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
 
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
     return "output.wav", normalized_text
 
+# Update the Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
     inputs=[
         gr.Textbox(label="Enter Turkish text to convert to speech"),
-        gr.Audio(label="Upload a short audio file of the target speaker", type="filepath")
+        gr.Audio(label="Upload a short audio file of the target speaker (optional)", type="filepath")
     ],
     outputs=[
         gr.Audio(label="Generated Speech"),
         gr.Textbox(label="Normalized Text")
     ],
     title="Turkish SpeechT5 Text-to-Speech Demo with Custom Speaker",
-    description="Enter Turkish text, upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
+    description="Enter Turkish text, optionally upload a short audio file of the target speaker, and listen to the generated speech using the fine-tuned SpeechT5 model. The text will be normalized for better pronunciation."
 )
 
-iface.launch()
+iface.launch(share=True)
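
The commit makes the reference-speaker upload optional: when no audio file is provided, the app falls back to a stock x-vector (index 7306 of Matthijs/cmu-arctic-xvectors) instead of calling create_speaker_embedding on an uploaded clip. Below is a minimal, self-contained sketch of that fallback path. It assumes the standard transformers SpeechT5 API; the base "microsoft/speecht5_tts" checkpoint and "microsoft/speecht5_hifigan" vocoder are stand-ins, since the Space's fine-tuned Turkish checkpoint is loaded elsewhere in app.py and is not shown in this diff.

```python
# Sketch of the default-speaker fallback introduced in this commit.
# Assumption: base SpeechT5 checkpoints stand in for the Space's fine-tuned Turkish model.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Default speaker embedding: entry 7306 of the CMU ARCTIC x-vector dataset,
# the same index the commit hard-codes for the no-upload case.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)

# Generate speech exactly as text_to_speech does once an embedding is chosen.
inputs = processor(text="Merhaba, bu bir deneme.", return_tensors="pt").to(device)
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("output.wav", speech.cpu().numpy(), samplerate=16000)
```

With this fallback in place the demo produces audio even when the user skips the upload, while a provided reference clip still routes through create_speaker_embedding for voice cloning.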