{
    "model_type": "AutoregressiveTransformer",
    "dataset": [
        "emilia",
    ],
    "preprocess": {
        "hop_size": 320,
        "sample_rate": 16000, // HuBERT, WavLM, are both 16000
        "n_fft": 1920,
        "num_mels": 128,
        "win_size": 1920,
        "fmin": 0,
        "fmax": 12000,
        "mel_var": 8.14,
        "mel_mean": -4.92,
        "processed_dir": "",
        "valid_file": "valid.json",
        "train_file": "train.json",
        "min_dur": 3,
        "max_dur": 30,
        "load_phone": true,
    },
    "model": {
        "autoregressive_transformer": {
            "input_vocab_size": 1056,
            "output_vocab_size": 8192,
            "hidden_size": 1536,
            "intermediate_size": 6144,
            "num_hidden_layers": 12,
            "num_attention_heads": 16,
            "use_global_style_encoder": false
        },
        "train_both_vc_and_tts": true,
        "vc_input_token_type": "hubert_vevo_codec",
        "vc_input_vocab_size": 32,
        "tts_input_token_type": "g2p",
        "tts_input_vocab_size": 1024,
        "output_token_type": "hubert_codec",
        "representation_stat_mean_var_path": "./models/vc/vevo/config/hubert_large_l18_mean_std.npz",
        "input_repcodec": {
            "config_path": "./models/vc/vevo/config/hubert_large_l18_c32.yaml",
        },
        "output_repcodec": {
            "codebook_size": 8192, // VQ Codebook Size
            "hidden_size": 1024, // Representations Dim
            "codebook_dim": 8,
            "vocos_dim": 384,
            "vocos_intermediate_dim": 2048,
            "vocos_num_layers": 12,
        }
    },
}