File size: 5,168 Bytes
9e275b8
 
db5766e
9e275b8
 
70399da
9e275b8
 
 
 
db5766e
9e275b8
 
db5766e
 
 
9e275b8
 
db5766e
 
70399da
db5766e
 
9e275b8
 
 
 
 
 
 
70399da
9e275b8
70399da
 
9e275b8
401875a
 
70399da
401875a
 
 
 
 
 
 
 
 
 
 
 
 
7d147e2
401875a
 
 
 
 
 
 
 
 
987fd27
 
 
 
 
 
 
 
 
e50b1be
 
 
401875a
987fd27
70399da
 
9e275b8
701af8c
987fd27
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import torch
from huggingface_hub import hf_hub_download

from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper


class ControllableInterface:
    """
    Front-end that pairs a ToucanTTSInterface synthesizer with a GanWrapper
    that supplies artificial speaker embeddings.

    The voice used for synthesis comes either from a reference audio, from a
    single fixed artificial voice, or from an average over several artificial
    voices (see ``read``).
    """

    def __init__(self, gpu_id="cpu", available_artificial_voices=50, tts_model_path=None, vocoder_model_path=None, embedding_gan_path=None):
        """
        Configure device visibility and build the TTS model and embedding GAN.

        Args:
            gpu_id: "cpu", "cuda", or anything else, which is written verbatim
                into CUDA_VISIBLE_DEVICES (presumably a device index).
            available_artificial_voices: forwarded to GanWrapper as
                ``num_cached_voices`` and stored on the instance.
            tts_model_path: forwarded to ToucanTTSInterface.
            vocoder_model_path: forwarded to ToucanTTSInterface.
            embedding_gan_path: checkpoint for the GanWrapper; when None, the
                file "embedding_gan.pt" is fetched from the
                "Flux9665/ToucanTTS" hub repository.
        """
        run_on_cpu = gpu_id == "cpu"
        if run_on_cpu:
            # hide every CUDA device from this process
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id != "cuda":
            # neither "cpu" nor "cuda": in this case we hopefully got a number.
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"

        if embedding_gan_path is None:
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")

        self.device = "cpu" if run_on_cpu else "cuda"
        self.model = ToucanTTSInterface(device=self.device,
                                        tts_model_path=tts_model_path,
                                        vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path,
                               num_cached_voices=available_artificial_voices,
                               device=self.device)
        self.generated_speaker_embeds = []
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def read(self, prompt, reference_audio, voice_seed, prosody_creativity, loudness_in_db):
        """
        Synthesize ``prompt`` and return ``(sr, wav)`` as produced by the model.

        Voice selection:
            * ``reference_audio`` given: the model's utterance embedding is set
              from it and a single synthesis pass is run.
            * no reference and ``voice_seed`` falsy: artificial voice with
              latent 7 and an all-zero control vector, single pass.
            * no reference and ``voice_seed`` truthy: latents 3 to 7 are
              synthesized one after another and their waveforms are averaged.
              Each pass after the first is fed the pitch, energy and durations
              returned by the pass before it.

        The prompt is passed with ``input_is_phones=True`` and is echoed to
        stdout before synthesis.
        """
        print(prompt + "\n\n")

        # keyword arguments common to every synthesis call below
        shared_settings = {
            "input_is_phones": True,
            "duration_scaling_factor": 1.0,
            "pitch_variance_scale": 1.0,
            "energy_variance_scale": 1.0,
            "pause_duration_scaling_factor": 1.0,
            "return_plot_as_filepath": False,
            "prosody_creativity": prosody_creativity,
            "loudness_in_db": loudness_in_db,
        }

        def load_artificial_voice(latent_index):
            # select a latent, leave all six controls at zero, hand the result to the model
            self.wgan.set_latent(latent_index)
            neutral_controls = torch.tensor([0.0] * 6, dtype=torch.float32)
            speaker_embedding = self.wgan.modify_embed(neutral_controls)
            self.model.set_utterance_embedding(embedding=speaker_embedding)

        def with_leading_axis(value):
            # None is passed through untouched so the first pass predicts its own prosody
            return None if value is None else value.unsqueeze(0)

        if reference_audio is not None:
            self.model.set_utterance_embedding(reference_audio)
        elif voice_seed:
            renditions = []
            pitch = energy = durations = None
            for latent_index in range(3, 8):
                load_artificial_voice(latent_index)
                wav, sr, pitch, energy, durations = self.model(prompt,
                                                               **shared_settings,
                                                               pitch=with_leading_axis(pitch),
                                                               energy=with_leading_axis(energy),
                                                               durations=with_leading_axis(durations))
                renditions.append(wav)
            wav = sum(renditions) / len(renditions)
        else:
            load_artificial_voice(7)

        if voice_seed and reference_audio is None:
            # the averaged waveform is already complete
            return sr, wav

        wav, sr, _pitch, _energy, _durations = self.model(prompt, **shared_settings)
        return sr, wav