Spaces:
Running
Running
File size: 5,168 Bytes
9e275b8 db5766e 9e275b8 70399da 9e275b8 db5766e 9e275b8 db5766e 9e275b8 db5766e 70399da db5766e 9e275b8 70399da 9e275b8 70399da 9e275b8 401875a 70399da 401875a 7d147e2 401875a 987fd27 e50b1be 401875a 987fd27 70399da 9e275b8 701af8c 987fd27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import os
import torch
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface
from Modules.ControllabilityGAN.GAN import GanWrapper
class ControllableInterface:
    """Facade tying a ToucanTTS synthesizer to an embedding GAN so that
    artificial voices (sampled from the GAN's latent space) or a reference
    audio can drive speech synthesis.

    Public interface: ``__init__`` and ``read``; everything prefixed with
    ``_`` is an internal helper.
    """

    def __init__(self, gpu_id="cpu", available_artificial_voices=50,
                 tts_model_path=None, vocoder_model_path=None, embedding_gan_path=None):
        """Set up device visibility, load the TTS model and the embedding GAN.

        Args:
            gpu_id: "cpu", "cuda", or a CUDA device index (number). Anything
                other than "cpu" makes the models run on CUDA.
            available_artificial_voices: how many GAN voices to cache.
            tts_model_path: optional local path for the TTS checkpoint.
            vocoder_model_path: optional local path for the vocoder checkpoint.
            embedding_gan_path: optional local path for the embedding GAN
                checkpoint; downloaded from the HF Hub when None.
        """
        if gpu_id == "cpu":
            # Hide all CUDA devices so torch falls back to CPU.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
        elif gpu_id == "cuda":
            # Leave device visibility as-is; torch picks a CUDA device itself.
            pass
        else:  # in this case we hopefully got a number.
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            os.environ["CUDA_VISIBLE_DEVICES"] = f"{gpu_id}"
        if embedding_gan_path is None:
            embedding_gan_path = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="embedding_gan.pt")
        self.device = "cuda" if gpu_id != "cpu" else "cpu"
        self.model = ToucanTTSInterface(device=self.device, tts_model_path=tts_model_path, vocoder_model_path=vocoder_model_path)
        self.wgan = GanWrapper(embedding_gan_path, num_cached_voices=available_artificial_voices, device=self.device)
        self.generated_speaker_embeds = list()
        self.available_artificial_voices = available_artificial_voices
        self.current_language = ""
        self.current_accent = ""

    @staticmethod
    def _neutral_controllability_vector():
        """Return the all-zero 6-dim controllability vector (no attribute shift)."""
        return torch.zeros(6, dtype=torch.float32)

    def _apply_seed_embedding(self, seed):
        """Select GAN latent ``seed``, derive a neutral speaker embedding and
        install it as the model's utterance embedding."""
        self.wgan.set_latent(seed)
        embedding = self.wgan.modify_embed(self._neutral_controllability_vector())
        self.model.set_utterance_embedding(embedding=embedding)

    def _synthesize(self, prompt, prosody_creativity, loudness_in_db,
                    pitch=None, energy=None, durations=None):
        """Run one synthesis pass with neutral prosody scaling.

        ``prompt`` is treated as a phone string. ``pitch``/``energy``/
        ``durations`` — when given — are prosody curves from a previous pass
        and get a batch dimension added before being fed back in, which keeps
        prosody identical across passes.

        Returns:
            (wav, sr, pitch, energy, durations) as produced by the model.
        """
        # NOTE(review): the first loop pass in read() already calls the model
        # with pitch/energy/durations explicitly None, so None is a valid value
        # for these keyword arguments.
        return self.model(prompt,
                          input_is_phones=True,
                          duration_scaling_factor=1.0,
                          pitch_variance_scale=1.0,
                          energy_variance_scale=1.0,
                          pause_duration_scaling_factor=1.0,
                          return_plot_as_filepath=False,
                          prosody_creativity=prosody_creativity,
                          loudness_in_db=loudness_in_db,
                          pitch=pitch.unsqueeze(0) if pitch is not None else pitch,
                          energy=energy.unsqueeze(0) if energy is not None else energy,
                          durations=durations.unsqueeze(0) if durations is not None else durations)

    def read(self,
             prompt,
             reference_audio,
             voice_seed,
             prosody_creativity,
             loudness_in_db
             ):
        """Synthesize ``prompt`` and return ``(sr, wav)``.

        Voice selection:
          * ``reference_audio`` given — clone the speaker from that audio.
          * ``voice_seed`` truthy — average the waveforms of five GAN voices
            (latents 3..7), reusing the first pass's prosody curves so the
            averaged signals line up.
          * neither — use a fixed default artificial voice (latent 7).
        """
        print(prompt + "\n\n")
        if reference_audio is None:
            if not voice_seed:
                # Fixed default artificial voice.
                self._apply_seed_embedding(7)
            else:
                wavs = list()
                pitch, energy, durations = None, None, None
                for seed in range(3, 8):
                    self._apply_seed_embedding(seed)
                    # Prosody from the first pass is fed back into the later
                    # passes, so every voice speaks with identical timing.
                    wav, sr, pitch, energy, durations = self._synthesize(
                        prompt, prosody_creativity, loudness_in_db,
                        pitch=pitch, energy=energy, durations=durations)
                    wavs.append(wav)
                # Equal-length waveforms (shared durations) -> plain mean.
                wav = sum(wavs) / len(wavs)
        else:
            self.model.set_utterance_embedding(reference_audio)
        if not voice_seed or reference_audio is not None:
            # Single-voice case: one synthesis pass with fresh prosody.
            wav, sr, pitch, energy, durations = self._synthesize(
                prompt, prosody_creativity, loudness_in_db)
        return sr, wav
|