NeutralToucan

Running

File size: 5,168 Bytes

import os
import torch
from huggingface_hub import hf_hub_download

from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper


class ControllableInterface:
    """Bundle a ToucanTTS model and a speaker-embedding GAN behind ``read``.

    The speaker identity used for synthesis comes either from a reference
    audio supplied by the caller or from embeddings produced by the GAN
    wrapper at fixed latent indices.
    """

    # Latent index selected on the GAN when a single generated voice is used.
    _SINGLE_VOICE_LATENT = 7
    # Latent indices whose renderings are averaged when a voice seed is given.
    _BLENDED_VOICE_LATENTS = range(3, 8)

    def __init__(self, gpu_id="cpu", available_artificial_voices=50, tts_model_path=None, vocoder_model_path=None, embedding_gan_path=None):
        """Configure device visibility and build the TTS model and the GAN.

        Args:
            gpu_id: ``"cpu"`` hides all CUDA devices from this process,
                ``"cuda"`` leaves the environment untouched, and any other
                value is written to ``CUDA_VISIBLE_DEVICES`` (so it is
                expected to be a device number).
            available_artificial_voices: forwarded to the GAN wrapper as
                ``num_cached_voices`` and stored on the instance.
            tts_model_path: forwarded to ``ToucanTTSInterface``.
            vocoder_model_path: forwarded to ``ToucanTTSInterface``.
            embedding_gan_path: checkpoint for the GAN wrapper; when ``None``
                the file ``embedding_gan.pt`` is fetched from the
                ``Flux9665/ToucanTTS`` hub repository.
        """
        if gpu_id == "cpu":
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id != "cuda":
            # Anything that is neither "cpu" nor "cuda" is treated as a device number.
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"

        if embedding_gan_path is None:
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")

        # A specific device number was already narrowed down through the
        # environment above, so every non-CPU setting maps to plain "cuda".
        self.device = "cpu" if gpu_id == "cpu" else "cuda"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path=tts_model_path, vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path, num_cached_voices=available_artificial_voices, device=self.device)
        self.generated_speaker_embeds = []
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def _apply_generated_voice(self, latent_index):
        """Select a GAN latent and install its unmodified embedding on the model."""
        self.wgan.set_latent(latent_index)
        # An all-zero controllability vector: no attribute is shifted.
        neutral_controls = torch.zeros(6, dtype=torch.float32)
        self.model.set_utterance_embedding(embedding=self.wgan.modify_embed(neutral_controls))

    def _synthesize(self, prompt, prosody_creativity, loudness_in_db, **prosody_overrides):
        """Run the TTS model with neutral scaling factors and return its raw result."""
        return self.model(prompt,
                          input_is_phones=True,
                          duration_scaling_factor=1.0,
                          pitch_variance_scale=1.0,
                          energy_variance_scale=1.0,
                          pause_duration_scaling_factor=1.0,
                          return_plot_as_filepath=False,
                          prosody_creativity=prosody_creativity,
                          loudness_in_db=loudness_in_db,
                          **prosody_overrides)

    @staticmethod
    def _with_batch_axis(values):
        """Return ``values.unsqueeze(0)``, passing ``None`` through unchanged."""
        return None if values is None else values.unsqueeze(0)

    def read(self, prompt, reference_audio, voice_seed, prosody_creativity, loudness_in_db):
        """Synthesize ``prompt`` and return ``(sample_rate, waveform)``.

        The prompt is echoed to stdout and handed to the model with
        ``input_is_phones=True``. The voice is chosen as follows:

        * ``reference_audio`` given: it is passed to the model as the
          utterance embedding source and a single rendering is produced.
        * no reference and a falsy ``voice_seed``: the generated voice at
          latent 7 is used for a single rendering.
        * no reference and a truthy ``voice_seed``: the prompt is rendered
          once per latent in 3..7 and the arithmetic mean of those
          renderings is returned. Only the truthiness of ``voice_seed`` is
          used; its value does not select a voice.

        Args:
            prompt: text handed to the model (must support ``+ "\\n\\n"``).
            reference_audio: reference for voice cloning, or ``None``.
            voice_seed: flag selecting the averaged multi-voice rendering.
            prosody_creativity: forwarded to the model unchanged.
            loudness_in_db: forwarded to the model unchanged.
        """
        print(prompt + "\n\n")

        has_reference = reference_audio is not None
        if has_reference:
            self.model.set_utterance_embedding(reference_audio)
        elif voice_seed:
            renderings = []
            pitch = energy = durations = None
            for latent_index in self._BLENDED_VOICE_LATENTS:
                self._apply_generated_voice(latent_index)
                # From the second pass on, the prosody returned by the previous
                # pass is fed back in, so all renderings share one prosody.
                wav, sr, pitch, energy, durations = self._synthesize(
                    prompt,
                    prosody_creativity,
                    loudness_in_db,
                    pitch=self._with_batch_axis(pitch),
                    energy=self._with_batch_axis(energy),
                    durations=self._with_batch_axis(durations),
                )
                renderings.append(wav)
            wav = sum(renderings) / len(renderings)
        else:
            self._apply_generated_voice(self._SINGLE_VOICE_LATENT)

        # Every path except the averaged multi-voice one still needs its
        # single rendering with the embedding installed above.
        if not voice_seed or has_reference:
            wav, sr, _, _, _ = self._synthesize(prompt, prosody_creativity, loudness_in_db)
        return sr, wav