Spaces:

amphion
/

Text-to-Speech

Running

Text-to-Speech / config /audioldm.json

Upload 685 files

0d80816 over 1 year ago

1.81 kB

	{
	// Configuration for model_type "AudioLDM" with task_type "tta".
	// NOTE(review): "base_config" presumably names a parent config that this
	// file extends/overrides — confirm against the config loader.
	"base_config": "config/base.json",
	"model_type": "AudioLDM",
	"task_type": "tta",
	// Datasets for this config; only "AudioCaps" is listed.
	"dataset": [
	"AudioCaps"
	],
	"preprocess": {
	// Features used for model training. Every use_* flag below is disabled
	// in this config.
	"use_spkid": false,
	"use_uv": false,
	"use_frame_pitch": false,
	"use_phone_pitch": false,
	"use_frame_energy": false,
	"use_phone_energy": false,
	"use_mel": false,
	"use_audio": false,
	"use_label": false,
	"use_one_hot": false,
	// NOTE(review): presumably the probability of masking/dropping the
	// conditioning input during training — confirm against the trainer.
	"cond_mask_prob": 0.1
	},
	// Model hyperparameters, grouped into three sections: "audioldm",
	// "autoencoderkl" and "noise_scheduler".
	"model": {
	// NOTE(review): in_channels/out_channels (4) equal
	// autoencoderkl.z_channels (4), which suggests this network operates on
	// the autoencoder's latents — confirm.
	"audioldm": {
	"image_size": 32,
	"in_channels": 4,
	"out_channels": 4,
	"model_channels": 256,
	"attention_resolutions": [
	4,
	2,
	1
	],
	"num_res_blocks": 2,
	"channel_mult": [
	1,
	2,
	4
	],
	"num_heads": 8,
	"use_spatial_transformer": true,
	"transformer_depth": 1,
	// NOTE(review): presumably the width of the conditioning embeddings fed
	// to the spatial transformer — confirm against the text encoder used.
	"context_dim": 768,
	"use_checkpoint": true,
	"legacy": false
	},
	// Autoencoder settings: in_channels and out_ch are both 1, z_channels is 4.
	"autoencoderkl": {
	"ch": 128,
	"ch_mult": [
	1,
	1,
	2,
	2,
	4
	],
	"num_res_blocks": 2,
	"in_channels": 1,
	"z_channels": 4,
	"out_ch": 1,
	"double_z": true
	},
	// NOTE(review): these key names look like constructor arguments of a
	// diffusers-style noise scheduler — confirm which scheduler class
	// consumes this section.
	"noise_scheduler": {
	"num_train_timesteps": 1000,
	"beta_start": 0.00085,
	"beta_end": 0.012,
	"beta_schedule": "scaled_linear",
	"clip_sample": false,
	"steps_offset": 1,
	"set_alpha_to_one": false,
	"skip_prk_steps": true,
	"prediction_type": "epsilon"
	}
	},
	// Training hyperparameters: "lronPlateau" and "adam" sections.
	"train": {
	// NOTE(review): presumably options for a reduce-on-plateau learning-rate
	// schedule — confirm. min_lr (4.0e-5) is close to adam.lr (5.0e-5), so
	// with factor 0.9 only a few reductions are possible before the floor.
	"lronPlateau": {
	"factor": 0.9,
	"patience": 100,
	"min_lr": 4.0e-5,
	"verbose": true
	},
	// NOTE(review): presumably passed to an Adam-style optimizer — confirm.
	"adam": {
	"lr": 5.0e-5,
	"betas": [
	0.9,
	0.999
	],
	"weight_decay": 1.0e-2,
	"eps": 1.0e-8
	}
	}
	}