"""
DeepSeek model configuration
"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

class DeepSeekConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DeepSeekModel`]. It is used to instantiate a
DeepSeek model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the DeepSeek-V3
[deepseek-ai/DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50256):
Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`DeepSeekModel`].
hidden_size (`int`, *optional*, defaults to 1024):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 4096):
Dimension of the MLP representations for dense layers.
moe_intermediate_size (`int`, *optional*, defaults to 704):
Dimension of the MLP representations for MoE layers.
num_hidden_layers (`int`, *optional*, defaults to 6):
Number of hidden layers in the Transformer decoder.
num_dense_layers (`int`, *optional*, defaults to 1):
Number of dense (non-MoE) layers in the model.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer decoder.
num_routed_experts (`int`, *optional*, defaults to 4):
Number of routed experts in MoE layers.
num_shared_experts (`int`, *optional*, defaults to 2):
Number of shared experts in MoE layers.
num_activated_experts (`int`, *optional*, defaults to 2):
Number of experts activated per token in MoE layers.
num_expert_groups (`int`, *optional*, defaults to 1):
Number of expert groups in MoE layers.
num_limited_groups (`int`, *optional*, defaults to 1):
Number of expert groups a token may be routed to in MoE layers (group-limited routing).
score_func (`str`, *optional*, defaults to `"softmax"`):
Scoring function for expert selection. Can be "softmax" or "sigmoid".
route_scale (`float`, *optional*, defaults to 1.0):
Scaling factor for routing weights.
q_lora_rank (`int`, *optional*, defaults to 0):
Rank of LoRA adaptation for query projection. 0 means no LoRA.
kv_lora_rank (`int`, *optional*, defaults to 256):
Rank of LoRA adaptation for key-value projection.
qk_nope_head_dim (`int`, *optional*, defaults to 64):
Dimension of query-key heads without positional encoding.
qk_rope_head_dim (`int`, *optional*, defaults to 32):
Dimension of query-key heads with rotary positional encoding.
v_head_dim (`int`, *optional*, defaults to 64):
Dimension of value heads.
original_seq_len (`int`, *optional*, defaults to 512):
Original sequence length used during pretraining.
rope_theta (`float`, *optional*, defaults to 10000.0):
Base frequency for rotary positional encoding.
rope_factor (`float`, *optional*, defaults to 40):
Scaling factor for RoPE frequency adjustment.
beta_fast (`int`, *optional*, defaults to 32):
Fast beta parameter for YaRN RoPE scaling.
beta_slow (`int`, *optional*, defaults to 1):
Slow beta parameter for YaRN RoPE scaling.
mscale (`float`, *optional*, defaults to 1.0):
Scale factor for attention logits when using extended context.
max_position_embeddings (`int`, *optional*, defaults to 256):
The maximum sequence length that this model might ever be used with.
max_batch_size (`int`, *optional*, defaults to 2):
The maximum batch size that this model might ever be used with for caching.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-3):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 0):
The id of the padding token.
bos_token_id (`int`, *optional*, defaults to 2):
The id of the "beginning-of-sequence" token.
eos_token_id (`int`, *optional*, defaults to 3):
The id of the "end-of-sequence" token.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
Example:

```python
>>> from transformers import DeepSeekModel, DeepSeekConfig
>>> # Initializing a DeepSeek configuration
>>> configuration = DeepSeekConfig()
>>> # Initializing a model from the configuration
>>> model = DeepSeekModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
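>>> # Overriding a few fields gives a customized variant (illustrative values,
>>> # not the released DeepSeek-V3 hyperparameters)
>>> custom_configuration = DeepSeekConfig(num_routed_experts=8, q_lora_rank=128)
>>> custom_configuration.num_routed_experts
8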
```"""
model_type = "deepseek"
keys_to_ignore_at_inference = ["past_key_values"]

def __init__(
self,
vocab_size=50256,
hidden_size=1024,
intermediate_size=4096,
moe_intermediate_size=704,
num_hidden_layers=6,
num_dense_layers=1,
num_attention_heads=8,
num_routed_experts=4,
num_shared_experts=2,
num_activated_experts=2,
num_expert_groups=1,
num_limited_groups=1,
score_func="softmax",
route_scale=1.0,
q_lora_rank=0,
kv_lora_rank=256,
qk_nope_head_dim=64,
qk_rope_head_dim=32,
v_head_dim=64,
original_seq_len=512,
rope_theta=10000.0,
rope_factor=40,
beta_fast=32,
beta_slow=1,
mscale=1.0,
max_position_embeddings=256,
max_batch_size=2,
initializer_range=0.02,
rms_norm_eps=1e-3,
use_cache=True,
pad_token_id=0,
bos_token_id=2,
eos_token_id=3,
tie_word_embeddings=False,
**kwargs,
):
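# Core model dimensions: vocabulary, hidden/MLP sizes, layer and head counts.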
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_dense_layers = num_dense_layers
self.num_attention_heads = num_attention_heads
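# Mixture-of-Experts routing: expert counts, grouping, and gating behavior.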
self.num_routed_experts = num_routed_experts
self.num_shared_experts = num_shared_experts
self.num_activated_experts = num_activated_experts
self.num_expert_groups = num_expert_groups
self.num_limited_groups = num_limited_groups
self.score_func = score_func
self.route_scale = route_scale
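# Attention: low-rank query/key-value projection ranks and per-head dimensions.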
self.q_lora_rank = q_lora_rank
self.kv_lora_rank = kv_lora_rank
self.qk_nope_head_dim = qk_nope_head_dim
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
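# Rotary position embeddings and YaRN-style context extension.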
self.original_seq_len = original_seq_len
self.rope_theta = rope_theta
self.rope_factor = rope_factor
self.beta_fast = beta_fast
self.beta_slow = beta_slow
self.mscale = mscale
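# Caching, initialization, normalization, and embedding tying.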
self.max_batch_size = max_batch_size
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.use_cache = use_cache
self.tie_word_embeddings = tie_word_embeddings
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
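

if __name__ == "__main__":
    # Minimal usage sketch, not part of the upstream API: build the default
    # configuration and a customized variant (the override values below are
    # illustrative assumptions, not DeepSeek-V3 release hyperparameters), then
    # round-trip the custom config through `to_dict`/`from_dict` to check that
    # the extra fields survive serialization.
    default_config = DeepSeekConfig()
    print("default hidden_size:", default_config.hidden_size)

    custom_config = DeepSeekConfig(
        num_routed_experts=8,  # illustrative override
        num_activated_experts=4,
        q_lora_rank=128,
    )
    restored = DeepSeekConfig.from_dict(custom_config.to_dict())
    assert restored.num_routed_experts == 8
    assert restored.q_lora_rank == 128
    print("round-trip through to_dict()/from_dict() preserved the overrides")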