import os

import torch
from huggingface_hub import hf_hub_download

from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper


class ControllableInterface:
    """Wraps a ToucanTTS model together with an embedding GAN, so that speech can
    be synthesized either with artificial voices sampled from the GAN or with a
    voice cloned from a reference audio."""

    def __init__(self, gpu_id="cpu", available_artificial_voices=50, tts_model_path=None, vocoder_model_path=None, embedding_gan_path=None):
        if gpu_id == "cpu":
            # hide all GPUs so torch falls back to the CPU
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id == "cuda":
            # use whatever CUDA device torch selects by default
            pass
        else:
            # in this case we hopefully got a number, which selects a specific GPU
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
        if embedding_gan_path is None:
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")
        self.device = "cuda" if gpu_id != "cpu" else "cpu"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path=tts_model_path, vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path, num_cached_voices=available_artificial_voices, device=self.device)
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    def read(self, prompt, reference_audio, voice_seed, prosody_creativity, loudness_in_db):
        print(prompt + "\n\n")
        if reference_audio is None:
            if not voice_seed:
                # single artificial voice: sample a fixed GAN latent and use a
                # neutral (all-zero) controllability vector
                self.wgan.set_latent(7)
                controllability_vector = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                embedding = self.wgan.modify_embed(controllability_vector)
                self.model.set_utterance_embedding(embedding=embedding)
            else:
                # averaged artificial voice: synthesize with five different GAN
                # latents (3 through 7) and average the waveforms. Prosody is
                # predicted on the first pass and fed back into the later ones,
                # so all five waveforms share the same durations and stay aligned.
                wavs = list()
                pitch, energy, durations = None, None, None
                for i in range(3, 8):
                    self.wgan.set_latent(i)
                    controllability_vector = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                    embedding = self.wgan.modify_embed(controllability_vector)
                    self.model.set_utterance_embedding(embedding=embedding)
                    wav, sr, pitch, energy, durations = self.model(prompt,
                                                                   input_is_phones=True,
                                                                   duration_scaling_factor=1.0,
                                                                   pitch_variance_scale=1.0,
                                                                   energy_variance_scale=1.0,
                                                                   pause_duration_scaling_factor=1.0,
                                                                   return_plot_as_filepath=False,
                                                                   prosody_creativity=prosody_creativity,
                                                                   loudness_in_db=loudness_in_db,
                                                                   pitch=pitch.unsqueeze(0) if pitch is not None else pitch,
                                                                   energy=energy.unsqueeze(0) if energy is not None else energy,
                                                                   durations=durations.unsqueeze(0) if durations is not None else durations)
                    wavs.append(wav)
                wav = sum(wavs) / len(wavs)
        else:
            # voice cloning: condition the model on the reference audio instead
            self.model.set_utterance_embedding(reference_audio)
        if not voice_seed or reference_audio is not None:
            # the averaged-voice branch above already produced a waveform;
            # the other two branches synthesize here
            wav, sr, pitch, energy, durations = self.model(prompt,
                                                           input_is_phones=True,
                                                           duration_scaling_factor=1.0,
                                                           pitch_variance_scale=1.0,
                                                           energy_variance_scale=1.0,
                                                           pause_duration_scaling_factor=1.0,
                                                           return_plot_as_filepath=False,
                                                           prosody_creativity=prosody_creativity,
                                                           loudness_in_db=loudness_in_db)
        return sr, wav
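

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. Assumptions: this
    # is run from the repository root so that InferenceInterfaces/ and Modules/
    # resolve, and the embedding GAN checkpoint is downloadable from the Hub or
    # already cached locally. Note that read() forwards the prompt with
    # input_is_phones=True, so the prompt is expected to be a phone string
    # rather than raw text; the phone string below is a hypothetical example.
    interface = ControllableInterface(gpu_id="cpu", available_artificial_voices=10)
    sr, wav = interface.read(prompt="h ə l oʊ",
                             reference_audio=None,
                             voice_seed=False,
                             prosody_creativity=0.3,
                             loudness_in_db=-24.0)
    # wav is a 1-D waveform (tensor or array, depending on the TTS interface)
    print(f"synthesized {len(wav)} samples at {sr} Hz")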