farzadab committed on
Commit
20ae7cc
·
verified ·
1 Parent(s): 69c6e87

add <|audio|> token

Browse files
Files changed (1) hide show
  1. config.json +79 -1
config.json CHANGED
@@ -1 +1,79 @@
1
- {"_name_or_path": "/Users/zhuang/repos/ultravox-omni/artifacts/model-zhuang.2025-01-08-v0_5.llama3_2-1b-4a.246352f:v8", "architectures": ["UltravoxModel"], "audio_config": {"_name_or_path": "openai/whisper-large-v3-turbo", "activation_dropout": 0.0, "activation_function": "gelu", "apply_spec_augment": false, "architectures": ["WhisperForConditionalGeneration"], "attention_dropout": 0.0, "begin_suppress_tokens": [220, 50256], "bos_token_id": 50257, "d_model": 1280, "decoder_attention_heads": 20, "decoder_ffn_dim": 5120, "decoder_layerdrop": 0.0, "decoder_layers": 4, "decoder_start_token_id": 50258, "dropout": 0.0, "encoder_attention_heads": 20, "encoder_ffn_dim": 5120, "encoder_layerdrop": 0.0, "encoder_layers": 32, "eos_token_id": 50257, "init_std": 0.02, "is_encoder_decoder": true, "max_source_positions": 1500, "max_target_positions": 448, "median_filter_width": 7, "model_type": "whisper", "num_hidden_layers": 32, "num_mel_bins": 128, "pad_token_id": 50257, "scale_embedding": false, "torch_dtype": "float16", "use_cache": true, "vocab_size": 51866}, "audio_latency_block_size": null, "audio_model_id": null, "auto_map": {"AutoConfig": "ultravox_config.UltravoxConfig", "AutoModel": "ultravox_model.UltravoxModel", "AutoProcessor": "ultravox_processing.UltravoxProcessor"}, "custom_pipelines": {"ultravox-pipeline": {"impl": "ultravox_pipeline.UltravoxPipeline", "pt": ["AutoModel"], "tf": [], "type": "multimodal"}}, "hidden_size": 4096, "ignore_index": -100, "initializer_range": 0.02, "model_type": "ultravox", "norm_init": 0.4, "pad_token_id": 128009, "projector_act": "swiglu", "projector_ln_mid": true, "stack_factor": 8, "text_model_id": "meta-llama/Llama-3.2-1B-Instruct", "torch_dtype": "bfloat16", "transformers_version": "4.48.1", "num_attention_heads": 32, "num_hidden_layers": 16, "vocab_size": 128256, "audio_token_index": 128256}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/Users/zhuang/repos/ultravox-omni/artifacts/model-zhuang.2025-01-08-v0_5.llama3_2-1b-4a.246352f:v8",
3
+ "architectures": [
4
+ "UltravoxModel"
5
+ ],
6
+ "audio_config": {
7
+ "_name_or_path": "openai/whisper-large-v3-turbo",
8
+ "activation_dropout": 0.0,
9
+ "activation_function": "gelu",
10
+ "apply_spec_augment": false,
11
+ "architectures": [
12
+ "WhisperForConditionalGeneration"
13
+ ],
14
+ "attention_dropout": 0.0,
15
+ "begin_suppress_tokens": [
16
+ 220,
17
+ 50256
18
+ ],
19
+ "bos_token_id": 50257,
20
+ "d_model": 1280,
21
+ "decoder_attention_heads": 20,
22
+ "decoder_ffn_dim": 5120,
23
+ "decoder_layerdrop": 0.0,
24
+ "decoder_layers": 4,
25
+ "decoder_start_token_id": 50258,
26
+ "dropout": 0.0,
27
+ "encoder_attention_heads": 20,
28
+ "encoder_ffn_dim": 5120,
29
+ "encoder_layerdrop": 0.0,
30
+ "encoder_layers": 32,
31
+ "eos_token_id": 50257,
32
+ "init_std": 0.02,
33
+ "is_encoder_decoder": true,
34
+ "max_source_positions": 1500,
35
+ "max_target_positions": 448,
36
+ "median_filter_width": 7,
37
+ "model_type": "whisper",
38
+ "num_hidden_layers": 32,
39
+ "num_mel_bins": 128,
40
+ "pad_token_id": 50257,
41
+ "scale_embedding": false,
42
+ "torch_dtype": "float16",
43
+ "use_cache": true,
44
+ "vocab_size": 51866
45
+ },
46
+ "audio_latency_block_size": null,
47
+ "audio_model_id": null,
48
+ "auto_map": {
49
+ "AutoConfig": "ultravox_config.UltravoxConfig",
50
+ "AutoModel": "ultravox_model.UltravoxModel",
51
+ "AutoProcessor": "ultravox_processing.UltravoxProcessor"
52
+ },
53
+ "custom_pipelines": {
54
+ "ultravox-pipeline": {
55
+ "impl": "ultravox_pipeline.UltravoxPipeline",
56
+ "pt": [
57
+ "AutoModel"
58
+ ],
59
+ "tf": [],
60
+ "type": "multimodal"
61
+ }
62
+ },
63
+ "hidden_size": 4096,
64
+ "ignore_index": -100,
65
+ "initializer_range": 0.02,
66
+ "model_type": "ultravox",
67
+ "norm_init": 0.4,
68
+ "pad_token_id": 128009,
69
+ "projector_act": "swiglu",
70
+ "projector_ln_mid": true,
71
+ "stack_factor": 8,
72
+ "text_model_id": "meta-llama/Llama-3.2-1B-Instruct",
73
+ "torch_dtype": "bfloat16",
74
+ "transformers_version": "4.48.1",
75
+ "num_attention_heads": 32,
76
+ "num_hidden_layers": 16,
77
+ "vocab_size": 128256,
78
+ "audio_token_index": 128256
79
+ }