Spaces:
Running
Running
| { | |
| "base_config": "config/base.json", | |
| "model_type": "AudioLDM", | |
| "task_type": "tta", | |
| "dataset": [ | |
| "AudioCaps" | |
| ], | |
| "preprocess": { | |
| // features used for model training | |
| "use_spkid": false, | |
| "use_uv": false, | |
| "use_frame_pitch": false, | |
| "use_phone_pitch": false, | |
| "use_frame_energy": false, | |
| "use_phone_energy": false, | |
| "use_mel": false, | |
| "use_audio": false, | |
| "use_label": false, | |
| "use_one_hot": false, | |
| "cond_mask_prob": 0.1 | |
| }, | |
| // model: settings for the audioldm network, the autoencoderkl, and the noise scheduler | |
| "model": { | |
| "audioldm": { | |
| "image_size": 32, | |
| "in_channels": 4, | |
| "out_channels": 4, | |
| "model_channels": 256, | |
| "attention_resolutions": [ | |
| 4, | |
| 2, | |
| 1 | |
| ], | |
| "num_res_blocks": 2, | |
| "channel_mult": [ | |
| 1, | |
| 2, | |
| 4 | |
| ], | |
| "num_heads": 8, | |
| "use_spatial_transformer": true, | |
| "transformer_depth": 1, | |
| "context_dim": 768, | |
| "use_checkpoint": true, | |
| "legacy": false | |
| }, | |
| "autoencoderkl": { | |
| "ch": 128, | |
| "ch_mult": [ | |
| 1, | |
| 1, | |
| 2, | |
| 2, | |
| 4 | |
| ], | |
| "num_res_blocks": 2, | |
| "in_channels": 1, | |
| "z_channels": 4, | |
| "out_ch": 1, | |
| "double_z": true | |
| }, | |
| "noise_scheduler": { | |
| "num_train_timesteps": 1000, | |
| "beta_start": 0.00085, | |
| "beta_end": 0.012, | |
| "beta_schedule": "scaled_linear", | |
| "clip_sample": false, | |
| "steps_offset": 1, | |
| "set_alpha_to_one": false, | |
| "skip_prk_steps": true, | |
| "prediction_type": "epsilon" | |
| } | |
| }, | |
| // train: settings for the lronPlateau learning-rate schedule and the adam optimizer | |
| "train": { | |
| "lronPlateau": { | |
| "factor": 0.9, | |
| "patience": 100, | |
| "min_lr": 4.0e-5, | |
| "verbose": true | |
| }, | |
| "adam": { | |
| "lr": 5.0e-5, | |
| "betas": [ | |
| 0.9, | |
| 0.999 | |
| ], | |
| "weight_decay": 1.0e-2, | |
| "eps": 1.0e-8 | |
| } | |
| } | |
| } |