werning committed on
Commit
c40d3e9
·
1 Parent(s): cbdb41f

Add config files

Browse files
models/d_vector_model/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "factory": "padertorch.contrib.tcl.speaker_embeddings.dvectors.DVectorModel",
3
+ "loss": {
4
+ "eps": 1e-07,
5
+ "factory": "padertorch.contrib.tcl.speaker_embeddings.loss.AngularPenaltySMLoss",
6
+ "in_features": 256,
7
+ "loss_type": "aam",
8
+ "m": null,
9
+ "out_features": 7196,
10
+ "reduce": "mean",
11
+ "s": null
12
+ },
13
+ "sampling_rate": 16000,
14
+ "speaker_net": {
15
+ "activation_fn": "relu",
16
+ "channels": [
17
+ 64,
18
+ 128,
19
+ 256,
20
+ 256
21
+ ],
22
+ "dvec_dim": 256,
23
+ "factory": "padertorch.contrib.tcl.speaker_embeddings.dvectors.ResNet34",
24
+ "in_channels": 1,
25
+ "norm": "batch",
26
+ "pre_activation": true
27
+ }
28
+ }
models/norm_flow/config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "factory": "pvq_manipulation.models.ffjord.FFJORD",
3
+ "normalize": true,
4
+ "ode_function": {
5
+ "condition_dim": 7,
6
+ "factory": "pvq_manipulation.models.ode_functions.CNFNN",
7
+ "hidden_channels": [
8
+ 512
9
+ ],
10
+ "input_dim": 256
11
+ }
12
+ }
models/norm_flow/speaker_conditioning.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "Weight",
3
+ "Resonance",
4
+ "Breathiness",
5
+ "Roughness",
6
+ "Loudness",
7
+ "Strain",
8
+ "Pitch"
9
+ ]
models/tts_model/config.json ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/net/vol/rautenberg/storage/acoustic_pitch/libritts_r_16",
3
+ "logger_uri": null,
4
+ "run_name": "acoustic_pitch",
5
+ "project_name": "YourTTS",
6
+ "run_description": "\n - YourTTS trained using the LibriTTS-R dataset\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": "loss_1",
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 50,
30
+ "eval_batch_size": 50,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000,
34
+ 1000
35
+ ],
36
+ "scheduler_after_epoch": true,
37
+ "lr": 0.001,
38
+ "optimizer": "AdamW",
39
+ "optimizer_params": {
40
+ "betas": [
41
+ 0.8,
42
+ 0.99
43
+ ],
44
+ "eps": 1e-09,
45
+ "weight_decay": 0.01
46
+ },
47
+ "lr_scheduler": null,
48
+ "lr_scheduler_params": {},
49
+ "use_grad_scaler": false,
50
+ "allow_tf32": false,
51
+ "cudnn_enable": true,
52
+ "cudnn_deterministic": false,
53
+ "cudnn_benchmark": false,
54
+ "training_seed": 54321,
55
+ "model": "vits",
56
+ "num_loader_workers": 8,
57
+ "num_eval_loader_workers": 0,
58
+ "use_noise_augment": false,
59
+ "audio": {
60
+ "fft_size": 1024,
61
+ "sample_rate": 24000,
62
+ "win_length": 1024,
63
+ "hop_length": 256,
64
+ "num_mels": 80,
65
+ "mel_fmin": 0.0,
66
+ "mel_fmax": null,
67
+ "fading": "half",
68
+ "window": "hann"
69
+ },
70
+ "use_phonemes": false,
71
+ "phonemizer": "espeak",
72
+ "phoneme_language": "en",
73
+ "compute_input_seq_cache": true,
74
+ "text_cleaner": "multilingual_cleaners",
75
+ "enable_eos_bos_chars": false,
76
+ "test_sentences_file": "",
77
+ "phoneme_cache_path": "None",
78
+ "characters": {
79
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
80
+ "vocab_dict": null,
81
+ "pad": "_",
82
+ "eos": "&",
83
+ "bos": "*",
84
+ "blank": null,
85
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;?\u00af\u2013\u00fc\u00f6\u00e4\u00df\u201a\u2018\u2019",
86
+ "punctuations": "!'(),-.:;? ",
87
+ "phonemes": "",
88
+ "is_unique": true,
89
+ "is_sorted": true
90
+ },
91
+ "add_blank": true,
92
+ "batch_group_size": 48,
93
+ "loss_masking": null,
94
+ "min_audio_len": 1,
95
+ "max_audio_len": 240000,
96
+ "min_text_len": 1,
97
+ "max_text_len": 9007199254740991,
98
+ "compute_f0": false,
99
+ "compute_energy": false,
100
+ "compute_linear_spec": true,
101
+ "precompute_num_workers": 12,
102
+ "start_by_longest": true,
103
+ "shuffle": false,
104
+ "drop_last": false,
105
+ "datasets": null,
106
+ "test_sentences": [
107
+ [
108
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
109
+ "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1089/134686/1089_134686_000001_000001.wav",
110
+ "1089",
111
+ "1089_134686_000001_000001"
112
+ ],
113
+ [
114
+ "Be a voice, not an echo.",
115
+ "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1089/134691/1089_134691_000004_000001.wav",
116
+ "1089",
117
+ "1089_134691_000004_000001"
118
+ ],
119
+ [
120
+ "I'm sorry Dave. I'm afraid I can't do that.",
121
+ "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/121/127105/121_127105_000024_000000.wav",
122
+ "121",
123
+ "121_127105_000024_000000"
124
+ ],
125
+ [
126
+ "This cake is great. It's so delicious and moist.",
127
+ "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1284/1181/1284_1181_000005_000000.wav",
128
+ "1284",
129
+ "1284_1181_000005_000000"
130
+ ]
131
+ ],
132
+ "eval_split_max_size": 256,
133
+ "eval_split_size": 0.01,
134
+ "use_speaker_weighted_sampler": false,
135
+ "speaker_weighted_sampler_alpha": 1.0,
136
+ "use_language_weighted_sampler": false,
137
+ "language_weighted_sampler_alpha": 1.0,
138
+ "use_length_weighted_sampler": false,
139
+ "length_weighted_sampler_alpha": 1.0,
140
+ "model_args": {
141
+ "num_chars": 84,
142
+ "out_channels": 513,
143
+ "spec_segment_size": 64,
144
+ "hidden_channels": 192,
145
+ "hidden_channels_ffn_text_encoder": 768,
146
+ "num_heads_text_encoder": 2,
147
+ "num_layers_text_encoder": 10,
148
+ "kernel_size_text_encoder": 3,
149
+ "dropout_p_text_encoder": 0.1,
150
+ "dropout_p_duration_predictor": 0.5,
151
+ "kernel_size_posterior_encoder": 5,
152
+ "dilation_rate_posterior_encoder": 1,
153
+ "num_layers_posterior_encoder": 16,
154
+ "kernel_size_flow": 5,
155
+ "dilation_rate_flow": 1,
156
+ "num_layers_flow": 4,
157
+ "resblock_type_decoder": "1",
158
+ "resblock_kernel_sizes_decoder": [
159
+ 3,
160
+ 7,
161
+ 11
162
+ ],
163
+ "resblock_dilation_sizes_decoder": [
164
+ [
165
+ 1,
166
+ 3,
167
+ 5
168
+ ],
169
+ [
170
+ 1,
171
+ 3,
172
+ 5
173
+ ],
174
+ [
175
+ 1,
176
+ 3,
177
+ 5
178
+ ]
179
+ ],
180
+ "upsample_rates_decoder": [
181
+ 8,
182
+ 8,
183
+ 2,
184
+ 2
185
+ ],
186
+ "upsample_initial_channel_decoder": 512,
187
+ "upsample_kernel_sizes_decoder": [
188
+ 16,
189
+ 16,
190
+ 4,
191
+ 4
192
+ ],
193
+ "periods_multi_period_discriminator": [
194
+ 2,
195
+ 3,
196
+ 5,
197
+ 7,
198
+ 11
199
+ ],
200
+ "use_sdp": false,
201
+ "noise_scale": 1.0,
202
+ "inference_noise_scale": 0.667,
203
+ "length_scale": 1,
204
+ "noise_scale_dp": 1.0,
205
+ "inference_noise_scale_dp": 1.0,
206
+ "max_inference_len": null,
207
+ "init_discriminator": true,
208
+ "use_spectral_norm_disriminator": false,
209
+ "use_speaker_embedding": false,
210
+ "num_speakers": 0,
211
+ "speakers_file": "/net/vol/rautenberg/storage/acoustic_pitch/libritts_r_16/acoustic_pitch-July-15-2024_04+08PM-1471536/speakers.pth",
212
+ "d_vector_file": null,
213
+ "speaker_embedding_channels": 256,
214
+ "use_d_vector_file": true,
215
+ "d_vector_dim": 256,
216
+ "detach_dp_input": true,
217
+ "use_language_embedding": false,
218
+ "embedded_language_dim": 4,
219
+ "num_languages": 0,
220
+ "language_ids_file": null,
221
+ "use_speaker_encoder_as_loss": false,
222
+ "speaker_encoder_config_path": "./models/d_vector_model/config_se.json",
223
+ "condition_dp_on_speaker": true,
224
+ "freeze_encoder": false,
225
+ "freeze_DP": false,
226
+ "freeze_PE": false,
227
+ "freeze_flow_decoder": false,
228
+ "freeze_waveform_decoder": false,
229
+ "encoder_sample_rate": null,
230
+ "interpolate_z": true,
231
+ "reinit_DP": false,
232
+ "reinit_text_encoder": false
233
+ },
234
+ "lr_gen": 0.0002,
235
+ "lr_disc": 0.0002,
236
+ "lr_scheduler_gen": "ExponentialLR",
237
+ "lr_scheduler_gen_params": {
238
+ "gamma": 0.999875,
239
+ "last_epoch": -1
240
+ },
241
+ "lr_scheduler_disc": "ExponentialLR",
242
+ "lr_scheduler_disc_params": {
243
+ "gamma": 0.999875,
244
+ "last_epoch": -1
245
+ },
246
+ "kl_loss_alpha": 1.0,
247
+ "disc_loss_alpha": 1.0,
248
+ "gen_loss_alpha": 1,
249
+ "feat_loss_alpha": 1.0,
250
+ "mel_loss_alpha": 45.0,
251
+ "dur_loss_alpha": 1.0,
252
+ "speaker_encoder_loss_alpha": 9.0,
253
+ "return_wav": true,
254
+ "use_weighted_sampler": true,
255
+ "weighted_sampler_attrs": {
256
+ "speaker_name": 1.0
257
+ },
258
+ "weighted_sampler_multipliers": {},
259
+ "r": 1,
260
+ "num_speakers": 0,
261
+ "use_speaker_embedding": false,
262
+ "speakers_file": "/net/vol/rautenberg/storage/acoustic_pitch/libritts_r_16/acoustic_pitch-July-15-2024_04+08PM-1471536/speakers.pth",
263
+ "speaker_embedding_channels": 256,
264
+ "language_ids_file": null,
265
+ "use_language_embedding": false,
266
+ "d_vectors_stor_file": false,
267
+ "d_vector_model_file": "./models/d_vector_model/",
268
+ "d_vector_dim": 256,
269
+ "d_vector_model": "nt_model",
270
+ "dataset_dict": {
271
+ "datasets": {
272
+ "libritts_r": {
273
+ "dataset_storage_path": "/net/vol/rautenberg/storage/Dataset/librittsr/",
274
+ "d_vector_storage_file": "/net/vol/rautenberg/storage/Dataset/d_vectors_libritts_r/nt",
275
+ "phone_alignement_path": "Phone_alignement/libri_tts_aligned/",
276
+ "hubert_features_storage_file": "/net/vol/rautenberg/storage/Dataset/hubert_libritts_r/nt",
277
+ "creak_labels_storage_file": "/net/vol/rautenberg/storage/Dataset/creak_labels/libritts_r/mean"
278
+ }
279
+ },
280
+ "test_sentences": [
281
+ {
282
+ "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
283
+ "audio_path": "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1089/134686/1089_134686_000001_000001.wav",
284
+ "speaker_id": "1089",
285
+ "example_id": "1089_134686_000001_000001",
286
+ "manipulation_times": [
287
+ 0.4,
288
+ 0.7,
289
+ 1
290
+ ],
291
+ "manipulation_factors": [
292
+ 0,
293
+ 0.5,
294
+ 1
295
+ ],
296
+ "manipulation_idx": [
297
+ 0
298
+ ],
299
+ "dataset_name": "libritts_r"
300
+ },
301
+ {
302
+ "text": "Be a voice, not an echo.",
303
+ "audio_path": "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1089/134691/1089_134691_000004_000001.wav",
304
+ "speaker_id": "1089",
305
+ "example_id": "1089_134691_000004_000001",
306
+ "manipulation_times": [
307
+ 0.4,
308
+ 0.7,
309
+ 1
310
+ ],
311
+ "manipulation_factors": [
312
+ 0,
313
+ 0.5,
314
+ 1
315
+ ],
316
+ "manipulation_idx": [
317
+ 0
318
+ ],
319
+ "dataset_name": "libritts_r"
320
+ },
321
+ {
322
+ "text": "I'm sorry Dave. I'm afraid I can't do that.",
323
+ "audio_path": "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/121/127105/121_127105_000024_000000.wav",
324
+ "speaker_id": "121",
325
+ "example_id": "121_127105_000024_000000",
326
+ "manipulation_times": [
327
+ 0.4,
328
+ 0.7,
329
+ 1
330
+ ],
331
+ "manipulation_factors": [
332
+ 0,
333
+ 0.5,
334
+ 1
335
+ ],
336
+ "manipulation_idx": [
337
+ 0
338
+ ],
339
+ "dataset_name": "libritts_r"
340
+ },
341
+ {
342
+ "text": "This cake is great. It's so delicious and moist.",
343
+ "audio_path": "/net/vol/rautenberg/storage/Dataset/librittsr/LibriTTS_R/test-clean/1284/1181/1284_1181_000005_000000.wav",
344
+ "speaker_id": "1284",
345
+ "example_id": "1284_1181_000005_000000",
346
+ "manipulation_times": [
347
+ 0.4,
348
+ 0.7,
349
+ 1
350
+ ],
351
+ "manipulation_factors": [
352
+ 0,
353
+ 0.5,
354
+ 1
355
+ ],
356
+ "manipulation_idx": [
357
+ 0
358
+ ],
359
+ "dataset_name": "libritts_r"
360
+ }
361
+ ]
362
+ },
363
+ "sample_rate": 24000,
364
+ "use_vad": false,
365
+ "CONFIG_SOLVER": "",
366
+ "use_speaker_embedding_cond": true
367
+ }