Flux9665 committed on
Commit
987fd27
·
1 Parent(s): f44a589

speed improvements and documentation

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -59,31 +59,31 @@ class ControllableInterface:
59
  0.0], dtype=torch.float32)
60
  embedding = self.wgan.modify_embed(controllability_vector)
61
  self.model.set_utterance_embedding(embedding=embedding)
62
- wav, sr, fig, pitch, energy, durations = self.model(prompt,
63
- input_is_phones=True,
64
- duration_scaling_factor=1.0,
65
- pitch_variance_scale=1.0,
66
- energy_variance_scale=1.0,
67
- pause_duration_scaling_factor=1.0,
68
- return_plot_as_filepath=True,
69
- prosody_creativity=prosody_creativity,
70
- loudness_in_db=loudness_in_db,
71
- pitch=pitch,
72
- energy=energy,
73
- durations=durations)
74
  wavs.append(wav)
75
- wav = sum(wavs)/len(wavs)
76
  else:
77
  self.model.set_utterance_embedding(reference_audio)
78
 
79
  if not voice_seed and reference_audio is not None:
80
- wav, sr, fig, pitch, energy, durations = self.model(prompt,
81
- input_is_phones=True,
82
- duration_scaling_factor=1.0,
83
- pitch_variance_scale=1.0,
84
- energy_variance_scale=1.0,
85
- pause_duration_scaling_factor=1.0,
86
- return_plot_as_filepath=True,
87
- prosody_creativity=prosody_creativity,
88
- loudness_in_db=loudness_in_db)
89
- return sr, wav, fig
 
59
  0.0], dtype=torch.float32)
60
  embedding = self.wgan.modify_embed(controllability_vector)
61
  self.model.set_utterance_embedding(embedding=embedding)
62
+ wav, sr, pitch, energy, durations = self.model(prompt,
63
+ input_is_phones=True,
64
+ duration_scaling_factor=1.0,
65
+ pitch_variance_scale=1.0,
66
+ energy_variance_scale=1.0,
67
+ pause_duration_scaling_factor=1.0,
68
+ return_plot_as_filepath=False,
69
+ prosody_creativity=prosody_creativity,
70
+ loudness_in_db=loudness_in_db,
71
+ pitch=pitch,
72
+ energy=energy,
73
+ durations=durations)
74
  wavs.append(wav)
75
+ wav = sum(wavs) / len(wavs)
76
  else:
77
  self.model.set_utterance_embedding(reference_audio)
78
 
79
  if not voice_seed and reference_audio is not None:
80
+ wav, sr, pitch, energy, durations = self.model(prompt,
81
+ input_is_phones=True,
82
+ duration_scaling_factor=1.0,
83
+ pitch_variance_scale=1.0,
84
+ energy_variance_scale=1.0,
85
+ pause_duration_scaling_factor=1.0,
86
+ return_plot_as_filepath=False,
87
+ prosody_creativity=prosody_creativity,
88
+ loudness_in_db=loudness_in_db)
89
+ return sr, wav
InferenceInterfaces/ToucanTTSInterface.py CHANGED
@@ -232,7 +232,7 @@ class ToucanTTSInterface(torch.nn.Module):
232
  plt.savefig("tmp.png")
233
  plt.close()
234
  return wave, sr, "tmp.png", pitch, energy, durations
235
- return wave, sr
236
 
237
  def read_to_file(self,
238
  text_list,
 
232
  plt.savefig("tmp.png")
233
  plt.close()
234
  return wave, sr, "tmp.png", pitch, energy, durations
235
+ return wave, sr, pitch, energy, durations
236
 
237
  def read_to_file(self,
238
  text_list,
app.py CHANGED
@@ -43,8 +43,7 @@ class TTSWebUI:
43
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
44
  # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
45
  ],
46
- outputs=[gr.Audio(type="numpy", label="Speech"),
47
- gr.Image(label="Visualization")],
48
  title=title,
49
  allow_flagging="never",
50
  description=article,
@@ -57,12 +56,12 @@ class TTSWebUI:
57
  voice_seed,
58
  reference_audio,
59
  ):
60
- sr, wav, fig = self.controllable_ui.read(prompt,
61
  reference_audio,
62
  voice_seed,
63
  prosody_creativity,
64
  -24.)
65
- return (sr, float2pcm(wav)), fig
66
 
67
 
68
  if __name__ == '__main__':
 
43
  # gr.Slider(minimum=0.5, maximum=1.5, step=0.1, value=1.0, label="Energy Variance Scale"),
44
  # gr.Slider(minimum=-10.0, maximum=10.0, step=0.1, value=0.0, label="Voice Depth")
45
  ],
46
+ outputs=[gr.Audio(type="numpy", label="Speech")],
 
47
  title=title,
48
  allow_flagging="never",
49
  description=article,
 
56
  voice_seed,
57
  reference_audio,
58
  ):
59
+ sr, wav = self.controllable_ui.read(prompt,
60
  reference_audio,
61
  voice_seed,
62
  prosody_creativity,
63
  -24.)
64
+ return (sr, float2pcm(wav))
65
 
66
 
67
  if __name__ == '__main__':