Flux9665 committed on
Commit
f44a589
·
1 Parent(s): de41fc6

speed improvements and documentation

Browse files
InferenceInterfaces/ControllableInterface.py CHANGED
@@ -72,7 +72,7 @@ class ControllableInterface:
72
  energy=energy,
73
  durations=durations)
74
  wavs.append(wav)
75
- wav = wavs.mean()
76
  else:
77
  self.model.set_utterance_embedding(reference_audio)
78
 
 
72
  energy=energy,
73
  durations=durations)
74
  wavs.append(wav)
75
+ wav = sum(wavs)/len(wavs)
76
  else:
77
  self.model.set_utterance_embedding(reference_audio)
78
 
Modules/GeneralLayers/Conformer.py CHANGED
@@ -136,7 +136,7 @@ class Conformer(torch.nn.Module):
136
  projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
137
  projected_lang_embs = self.language_emb_norm(projected_lang_embs)
138
  proj_lang_embs_s.append(projected_lang_embs)
139
- xs = xs + proj_lang_embs_s.mean() # offset phoneme representation by language specific offset
140
 
141
  xs = self.pos_enc(xs)
142
 
 
136
  projected_lang_embs = self.language_embedding_projection(lang_embs).unsqueeze(-1).transpose(1, 2)
137
  projected_lang_embs = self.language_emb_norm(projected_lang_embs)
138
  proj_lang_embs_s.append(projected_lang_embs)
139
+ xs = xs + (sum(proj_lang_embs_s)/len(proj_lang_embs_s)) # offset phoneme representation by language specific offset
140
 
141
  xs = self.pos_enc(xs)
142