gitgato committed on
Commit
5aa51e3
·
verified ·
1 Parent(s): 4bb0f8e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -0
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import pipeline
3
+ from datasets import load_dataset
4
+
5
# Speech-recognition pipeline backed by the Whisper base checkpoint. It is
# created once at import time and reused by translate() for every request.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
)
8
def translate(audio):
    """Return the text produced by the ASR pipeline in ``translate`` mode.

    Args:
        audio: Audio input forwarded unchanged to the module-level ``pipe``.
            The Gradio inputs in this file are configured with
            ``type="filepath"``, so callers here pass a path to an audio file.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    # Ask the model to translate rather than transcribe.
    whisper_options = {"task": "translate"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=whisper_options)
    return result["text"]
11
+
12
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
13
+
14
# SpeechT5 text-to-speech stack. The processor and the acoustic model are
# loaded from the same checkpoint; the HiFi-GAN vocoder has its own.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
18
+
19
# Pre-computed speaker x-vectors; one row is picked as the fixed output voice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Each row keeps its embedding under the "xvector" column. Indexing the row
# with a model repo id (as the code did before) raises KeyError at import.
# unsqueeze(0) adds the leading batch dimension passed to generate_speech().
speaker_embeddings = torch.tensor(embeddings_dataset[144]["xvector"]).unsqueeze(0)
21
+
22
def synthesise(text):
    """Generate speech for ``text`` with the module-level SpeechT5 objects.

    Args:
        text: The string to speak; it is tokenised by ``processor``.

    Returns:
        The tensor returned by ``model.generate_speech`` (run with the fixed
        ``speaker_embeddings`` and ``vocoder``), moved to the CPU.
    """
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]
    waveform = model.generate_speech(token_ids, speaker_embeddings, vocoder=vocoder)
    return waveform.cpu()
28
+
29
+ import numpy as np
30
+
31
# Output sample format for the Gradio audio component, and the largest value
# that format can hold (used to scale the float waveform).
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max


def speech_to_speech_translation(audio):
    """Translate spoken ``audio`` and return the result as synthesised speech.

    The input is passed through ``translate`` to obtain text, the text is
    passed through ``synthesise`` to obtain a float waveform, and the
    waveform is converted to integer PCM samples.

    Args:
        audio: Audio input accepted by ``translate``.

    Returns:
        A ``(sample_rate, samples)`` tuple, the form a ``gr.Audio`` output
        with ``type="numpy"`` consumes. ``sample_rate`` is always 16000
        (presumably the vocoder's output rate -- TODO confirm) and
        ``samples`` is a NumPy array of ``target_dtype``.
    """
    translated_text = translate(audio)
    waveform = synthesise(translated_text).numpy()
    # Samples outside [-1, 1] would overflow the integer range and wrap
    # around when cast, so clamp before scaling.
    clipped = np.clip(waveform, -1.0, 1.0)
    pcm_samples = (clipped * max_range).astype(target_dtype)
    return 16000, pcm_samples
40
+
41
+ import gradio as gr
42
+
43
def _translation_interface(source):
    """Build an Interface that feeds audio from ``source`` to the translator."""
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=[source], type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
    )


demo = gr.Blocks()

# One interface per input method; both share the same processing function.
mic_translate = _translation_interface("microphone")
file_translate = _translation_interface("upload")

tab_pages = [mic_translate, file_translate]
tab_titles = ["Microphone", "Audio File"]

with demo:
    gr.TabbedInterface(tab_pages, tab_titles)

demo.launch(debug=True)