import os

import torch
from huggingface_hub import hf_hub_download

from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper

class ControllableInterface:

    def __init__(self, gpu_id="cpu", available_artificial_voices=50, tts_model_path=None, vocoder_model_path=None, embedding_gan_path=None):
        if gpu_id == "cpu":
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id == "cuda":
            pass  # use whatever CUDA device is already visible
        else:  # in this case we hopefully got a number, i.e. the index of a specific GPU
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
        if embedding_gan_path is None:
            # fall back to the pretrained embedding GAN from the Hugging Face Hub
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")
        self.device = "cuda" if gpu_id != "cpu" else "cpu"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path=tts_model_path, vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path, num_cached_voices=available_artificial_voices, device=self.device)
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""
    def read(self,
             prompt,
             reference_audio,
             voice_seed,
             prosody_creativity,
             loudness_in_db):
        print(prompt + "\n\n")
        if reference_audio is None:
            if not voice_seed:
                # no reference audio and no seed: use a fixed latent as the default artificial voice
                self.wgan.set_latent(7)
                controllability_vector = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                embedding = self.wgan.modify_embed(controllability_vector)
                self.model.set_utterance_embedding(embedding=embedding)
            else:
                # average the waveforms of several artificial voices; the prosody
                # (pitch, energy, durations) generated in the first pass is fed back
                # into the following passes so that all waveforms line up in time
                wavs = list()
                pitch, energy, durations = None, None, None
                for i in range(3, 8):
                    self.wgan.set_latent(i)
                    controllability_vector = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], dtype=torch.float32)
                    embedding = self.wgan.modify_embed(controllability_vector)
                    self.model.set_utterance_embedding(embedding=embedding)
                    wav, sr, pitch, energy, durations = self.model(prompt,
                                                                   input_is_phones=True,
                                                                   duration_scaling_factor=1.0,
                                                                   pitch_variance_scale=1.0,
                                                                   energy_variance_scale=1.0,
                                                                   pause_duration_scaling_factor=1.0,
                                                                   return_plot_as_filepath=False,
                                                                   prosody_creativity=prosody_creativity,
                                                                   loudness_in_db=loudness_in_db,
                                                                   pitch=pitch.unsqueeze(0) if pitch is not None else pitch,
                                                                   energy=energy.unsqueeze(0) if energy is not None else energy,
                                                                   durations=durations.unsqueeze(0) if durations is not None else durations)
                    wavs.append(wav)
                wav = sum(wavs) / len(wavs)
        else:
            # clone the voice from the given reference audio
            self.model.set_utterance_embedding(reference_audio)
        if not voice_seed or reference_audio is not None:
            wav, sr, pitch, energy, durations = self.model(prompt,
                                                           input_is_phones=True,
                                                           duration_scaling_factor=1.0,
                                                           pitch_variance_scale=1.0,
                                                           energy_variance_scale=1.0,
                                                           pause_duration_scaling_factor=1.0,
                                                           return_plot_as_filepath=False,
                                                           prosody_creativity=prosody_creativity,
                                                           loudness_in_db=loudness_in_db)
        return sr, wav
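

# Usage sketch (added for illustration, not part of the original file).
# Building the interface downloads embedding_gan.pt from the Flux9665/ToucanTTS
# repo on the Hugging Face Hub unless embedding_gan_path points to a local file.
# read() calls the model with input_is_phones=True, so the prompt is expected to
# be a phone sequence; the phone string and the values for prosody_creativity
# and loudness_in_db below are hypothetical placeholders.
if __name__ == "__main__":
    interface = ControllableInterface(gpu_id="cpu")
    sr, wav = interface.read(prompt="h ə l oʊ",  # hypothetical phone string
                             reference_audio=None,
                             voice_seed=False,
                             prosody_creativity=0.1,
                             loudness_in_db=-24.0)
    # wav is a waveform at sampling rate sr, e.g. writable with soundfile:
    # import soundfile; soundfile.write("out.wav", wav, sr)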