import torch
torch.manual_seed(160923)  # fix the global random seed for reproducibility


import gradio as gr
import torch.cuda
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm
from Utility.utils import load_json_from_path


class TTSWebUI:

    def __init__(self,
                 gpu_id="cpu",
                 title="Phoneme Synthesis with Neutral Accent and Many Speakers",
                 article="Put in a string of IPA characters and have it pronounced in a way that is averaged across many languages. Use ~ to get a pause and include any punctuation marks you would normally use. If you enable the checkbox, the model will take much longer, but the result will be spoken by 10 artificial voices at the same time. <br>",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=10  # be careful with this: requesting too many voices can lead to an endless loop
                 ):
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        text_selection = [f"{iso_to_name[iso_code]} ({iso_code})" for iso_code in iso_to_name]
        # accent_selection = [f"{iso_to_name[iso_code]} Accent ({iso_code})" for iso_code in iso_to_name]

        self.controllable_ui = ControllableInterface(gpu_id=gpu_id,
                                                     available_artificial_voices=available_artificial_voices,
                                                     tts_model_path=tts_model_path,
                                                     vocoder_model_path=vocoder_model_path,
                                                     embedding_gan_path=embedding_gan_path)
        self.iface = gr.Interface(fn=self.read,
                                  inputs=[gr.Textbox(lines=2,
                                                     placeholder="put in IPA symbols here...",
                                                     value="~tə ɡɛt ɐ pˈɔːz~ plˈeɪs ɐ tˈɪldə sˈɪmbəl. jˈuːs pʌŋktʃuːˈeɪʃən~ æz ɪf ðɪs wʌz tˈɛkst.~#",
                                                     label="IPA Input"),
                                          gr.Slider(minimum=0.0, maximum=0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                                          gr.Checkbox(value=False, label="Speak in many Voices"),
                                          gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone (if left empty, will use an artificial voice instead)"),
                                          # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Pitch Variance Scale"),
                                          # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
                                          # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
                                          ],
                                  outputs=[gr.Audio(type="numpy", label="Speech")],
                                  title=title,
                                  allow_flagging="never",
                                  description=article,
                                  theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange"))
        self.iface.launch()

    def read(self,
             prompt,
             prosody_creativity,
             voice_seed,
             reference_audio,
             ):
        # wrap the prompt in pause markers and map commas to pauses (~) before synthesis
        sr, wav = self.controllable_ui.read("~" + prompt.replace(",", "~") + "~#",
                                            reference_audio,
                                            voice_seed,
                                            prosody_creativity,
                                            -24.)
        # float2pcm converts the float waveform to 16-bit PCM for Gradio's numpy audio output
        return (sr, float2pcm(wav))


if __name__ == '__main__':
    TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu")
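
# A minimal usage sketch, assuming you want to load local checkpoints instead of
# the defaults that ControllableInterface resolves (e.g. from the Hugging Face Hub).
# The paths below are hypothetical placeholders, not files shipped with this repository.
#
# TTSWebUI(gpu_id="cuda" if torch.cuda.is_available() else "cpu",
#          tts_model_path="Models/ToucanTTS_Meta/best.pt",      # hypothetical path
#          vocoder_model_path="Models/Vocoder/best.pt",         # hypothetical path
#          embedding_gan_path="Models/EmbeddingGAN/best.pt",    # hypothetical path
#          available_artificial_voices=10)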