{
  "model_type": "Vocoder",
  "dataset": ["emilia"],
  "preprocess": {
    "hop_size": 480,
    "sample_rate": 24000,
    "max_length": 36000,
    "n_fft": 1920,
    "num_mels": 128,
    "win_size": 1920,
    "fmin": 0,
    "fmax": 12000,
    "mel_var": 8.14,
    "mel_mean": -4.92,
    "processed_dir": "",
    "valid_file": "valid.json",
    "train_file": "train.json",
    "use_phone_cond": false,
    "use_emilia_101k": true
  },
  "model": {
    "vocos": {
      "input_channels": 128,
      "dim": 1024,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 1920,
      "hop_size": 480,
      "padding": "same"
    },
    "period_gan": {
      "max_downsample_channels": 1024,
      "channels": 64,
      "channel_increasing_factor": 2
    },
    "spec_gan": {
      "stft_params": {
        "fft_sizes": [128, 256, 512, 1024, 2048],
        "hop_sizes": [32, 64, 128, 256, 512],
        "win_lengths": [128, 256, 512, 1024, 2048],
        "window": "hann_window"
      },
      "in_channels": 1,
      "out_channels": 1,
      "channels": 64,
      "kernel_sizes": [5, 3],
      "max_downsample_channels": 1024,
      "down_scales": [2, 2, 2],
      "use_weight_norm": true,
      "use_complex": false
    }
  }
}