cfli committed on
Commit
117d1b6
·
verified ·
1 Parent(s): 790304d

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "BAAI/Matroyshka-ReRanker-document",
3
+ "architectures": [
4
+ "CostWiseMistralForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "BAAI/Matroyshka-ReRanker-document--mistral_config.CostWiseMistralConfig",
8
+ "AutoModel": "BAAI/Matroyshka-ReRanker-document--mistral_model.CostWiseMistralModel",
9
+ "AutoModelForCausalLM": "BAAI/Matroyshka-ReRanker-document--mistral_model.CostWiseMistralForCausalLM"
10
+ },
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "eos_token_id": 2,
14
+ "head_dim": 128,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 4096,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 14336,
19
+ "layer_sep": 1,
20
+ "layer_wise": true,
21
+ "max_position_embeddings": 32768,
22
+ "model_type": "cost_wise_mistral",
23
+ "num_attention_heads": 32,
24
+ "num_hidden_layers": 32,
25
+ "num_key_value_heads": 8,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_theta": 10000.0,
28
+ "sliding_window": 4096,
29
+ "start_layer": 4,
30
+ "tie_word_embeddings": false,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.46.0",
33
+ "use_cache": true,
34
+ "vocab_size": 32000
35
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.46.0"
6
+ }
mistral_config.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Mistral model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.utils import logging
19
+ from transformers.models.mistral.configuration_mistral import MistralConfig
20
+
21
+ logger = logging.get_logger(__name__)
22
+
23
class CostWiseMistralConfig(MistralConfig):
    r"""
    Configuration for the cost-wise (layer-wise) Mistral reranker models defined in `mistral_model.py`.

    The class extends [`MistralConfig`] with three extra fields and forwards every other keyword argument to the
    parent constructor unchanged. See [`MistralConfig`] / [`PretrainedConfig`] for the inherited behaviour.

    Args:
        start_layer (`int`, *optional*, defaults to 18):
            First value of the `range(start_layer, num_hidden_layers + 1, layer_sep)` sequence that
            `CostWiseMistralForCausalLM` uses to decide how many scoring heads to create when `layer_wise` is set.
        layer_sep (`int`, *optional*, defaults to 18):
            Step of that same `range(...)`, i.e. the spacing between layers that receive a scoring head.
        layer_wise (`bool`, *optional*, defaults to `False`):
            When `True`, `CostWiseMistralForCausalLM` replaces the vocabulary projection with a list of
            single-output heads, and `CostWiseMistralModel.forward` always collects hidden states at the
            requested cut-off layers.
        **kwargs:
            Forwarded to [`MistralConfig`]. The commonly used ones, as documented by the upstream class:

            - vocab_size (`int`): vocabulary size of the model.
            - hidden_size (`int`): dimension of the hidden representations.
            - intermediate_size (`int`): dimension of the MLP representations.
            - num_hidden_layers (`int`): number of decoder layers.
            - num_attention_heads (`int`): number of attention heads per layer.
            - num_key_value_heads (`int`): number of key/value heads for Grouped Query Attention. If
              `num_key_value_heads=num_attention_heads` the model uses Multi Head Attention, if
              `num_key_value_heads=1` it uses Multi Query Attention, otherwise GQA
              (see https://arxiv.org/pdf/2305.13245.pdf).
            - hidden_act (`str` or `function`): non-linear activation used in the decoder.
            - max_position_embeddings (`int`): maximum sequence length the model might be used with.
            - initializer_range (`float`): standard deviation used to initialise weight matrices.
            - rms_norm_eps (`float`): epsilon used by the RMS normalisation layers.
            - use_cache (`bool`): whether the model should return the last key/value attentions.
            - pad_token_id / bos_token_id / eos_token_id (`int`): special token ids.
            - tie_word_embeddings (`bool`): whether input and output embeddings are tied.
            - rope_theta (`float`): base period of the RoPE embeddings.
            - sliding_window (`int`): sliding-window attention size.
            - attention_dropout (`float`): dropout ratio for the attention probabilities.

    ```python
    >>> # Initializing a layer-wise configuration with a scoring head every 4 layers, starting at layer 4
    >>> configuration = CostWiseMistralConfig(start_layer=4, layer_sep=4, layer_wise=True)

    >>> # The extra fields are stored as plain attributes
    >>> configuration.layer_wise
    True
    ```"""

    model_type = "cost_wise_mistral"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(self, start_layer: int = 18, layer_sep: int = 18, layer_wise: bool = False, **kwargs):
        # The extension fields are stored first; everything else is handled by MistralConfig.
        self.start_layer = start_layer
        self.layer_sep = layer_sep
        self.layer_wise = layer_wise

        super().__init__(**kwargs)
mistral_model.py ADDED
@@ -0,0 +1,706 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ PyTorch Mistral model."""
21
+ import inspect
22
+ from dataclasses import dataclass
23
+
24
+ import math
25
+ import warnings
26
+ from typing import List, Optional, Tuple, Union
27
+
28
+ import torch
29
+ import torch.nn.functional as F
30
+ import torch.utils.checkpoint
31
+ from torch import nn
32
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
33
+
34
+ from transformers.activations import ACT2FN
35
+ from transformers.cache_utils import Cache, DynamicCache
36
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
37
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.utils import (
40
+ add_start_docstrings,
41
+ add_start_docstrings_to_model_forward,
42
+ is_flash_attn_2_available,
43
+ is_flash_attn_greater_or_equal_2_10,
44
+ logging,
45
+ replace_return_docstrings, ModelOutput,
46
+ )
47
+ from .mistral_config import CostWiseMistralConfig
48
+
49
+ from transformers.models.mistral.modeling_mistral import (
50
+ MistralRMSNorm,
51
+ MistralRotaryEmbedding,
52
+ rotate_half,
53
+ apply_rotary_pos_emb,
54
+ MistralMLP,
55
+ repeat_kv,
56
+ MistralAttention,
57
+ MistralFlashAttention2,
58
+ MistralSdpaAttention,
59
+ MISTRAL_ATTENTION_CLASSES,
60
+ MistralDecoderLayer,
61
+ MISTRAL_START_DOCSTRING,
62
+ MistralPreTrainedModel,
63
+ MISTRAL_INPUTS_DOCSTRING,
64
+
65
+ )
66
+
67
+ logger = logging.get_logger(__name__)
68
+
69
+ _CONFIG_FOR_DOC = "CostWiseMistralConfig"
70
+
71
+ @dataclass
72
+ class CostWiseModelOutputWithPast(ModelOutput):
73
+ last_hidden_state: torch.FloatTensor = None
74
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
75
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
76
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
77
+ attention_masks: Optional[Tuple[torch.FloatTensor]] = None
78
+
79
+ @dataclass
80
+ class CostWiseCausalLMOutputWithPast(ModelOutput):
81
+ loss: Optional[torch.FloatTensor] = None
82
+ logits: torch.FloatTensor = None
83
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
84
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
85
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
86
+ attention_masks: Optional[Tuple[torch.FloatTensor]] = None
87
+
88
+ def token_compress(compress_ratio,
89
+ hidden_states,
90
+ attention_mask,
91
+ query_lengths,
92
+ prompt_lengths,
93
+ weights: torch.Tensor = None):
94
+ # hidden_states = hidden_states.to('cpu')
95
+ # attention_mask = attention_mask.to('cpu')
96
+ # query_lengths = query_lengths.to('cpu')
97
+ # prompt_lengths = prompt_lengths.to('cpu')
98
+ # weights = weights.to('cpu')
99
+ # get some specific parameters
100
+ passage_lengths = torch.sum(attention_mask, dim=1, dtype=torch.int) - query_lengths - prompt_lengths # the raw passage lengths
101
+ retain_passage_lengths = (passage_lengths + compress_ratio - 1) // compress_ratio # the passage lengths need to be retained
102
+ final_useful_lengths = query_lengths + prompt_lengths + retain_passage_lengths # the final useful length after compress
103
+ max_passage_length = torch.max(passage_lengths) # the max passage lengths
104
+ max_final_lengths = torch.max(final_useful_lengths) # the max useful lengths after compress
105
+ # make new hidden states and new attention masks
106
+ new_hidden_states = torch.zeros((hidden_states.shape[0], max_final_lengths,
107
+ hidden_states.shape[-1]), dtype=hidden_states.dtype).to(hidden_states.device)
108
+ new_attention_mask = torch.ones((hidden_states.shape[0], max_final_lengths), dtype=attention_mask.dtype).to(attention_mask.device)
109
+ # get new attention mask
110
+ mask_attention_index = torch.arange(max_final_lengths, device=hidden_states.device).unsqueeze(0) >= final_useful_lengths[:, None]
111
+ new_attention_mask[mask_attention_index] = 0
112
+ # get new hidden states
113
+ # add query into new hidden states
114
+ query_index = torch.arange(max_final_lengths, device=hidden_states.device).unsqueeze(0)
115
+ mask_query_index = query_index < query_lengths[:, None]
116
+ new_hidden_states[mask_query_index] = hidden_states[:, : max_final_lengths, :][mask_query_index]
117
+ # add prompt into new hidden states
118
+ # get the index of the prompt in new hidden states
119
+ new_prompt_start_length = query_lengths + retain_passage_lengths
120
+ new_prompt_end_length = new_prompt_start_length + prompt_lengths
121
+ new_prompt_index = torch.arange(max_final_lengths, device=hidden_states.device).unsqueeze(0)
122
+ new_mask_prompt_index_start = new_prompt_index >= new_prompt_start_length[:, None]
123
+ new_mask_prompt_index_end = new_prompt_index < new_prompt_end_length[:, None]
124
+ new_mask_prompt_index = new_mask_prompt_index_start & new_mask_prompt_index_end
125
+ # get the index of the prompt in hidden states
126
+ raw_prompt_start_length = query_lengths + passage_lengths
127
+ raw_prompt_end_length = raw_prompt_start_length + prompt_lengths
128
+ raw_prompt_index = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
129
+ raw_mask_prompt_index_start = raw_prompt_index >= raw_prompt_start_length[:, None]
130
+ raw_mask_prompt_index_end = raw_prompt_index < raw_prompt_end_length[:, None]
131
+ raw_mask_prompt_index = raw_mask_prompt_index_start & raw_mask_prompt_index_end
132
+ # replace the prompt hidden states
133
+ new_hidden_states[new_mask_prompt_index] = hidden_states[raw_mask_prompt_index]
134
+ # 以上均没问题
135
+
136
+ # print(new_hidden_states.view(len(new_hidden_states), -1))
137
+ # print(new_attention_mask)
138
+
139
+ # get the index of the passage in new hidden states
140
+ new_passage_start_length = query_lengths
141
+ new_passage_end_length = new_passage_start_length + retain_passage_lengths
142
+ new_passage_index = torch.arange(max_final_lengths, device=hidden_states.device).unsqueeze(0)
143
+ new_mask_passage_index_start = new_passage_index >= new_passage_start_length[:, None]
144
+ new_mask_passage_index_end = new_passage_index < new_passage_end_length[:, None]
145
+ new_mask_passage_index = new_mask_passage_index_start & new_mask_passage_index_end
146
+ # print(query_lengths, prompt_lengths, retain_passage_lengths, final_useful_lengths)
147
+ # add passage into new hidden states
148
+ # get mask hidden states
149
+ psg_start_length = query_lengths
150
+ psg_end_length = query_lengths + passage_lengths
151
+ psg_index = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
152
+ mask_psg_index_start = psg_index >= psg_start_length[:, None]
153
+ mask_psg_index_end = psg_index < psg_end_length[:, None]
154
+ mask_psg_index = mask_psg_index_start & mask_psg_index_end
155
+
156
+ hidden_states = hidden_states * mask_psg_index.unsqueeze(-1)
157
+ passage_hidden_states = torch.zeros((hidden_states.shape[0],
158
+ (max_passage_length + compress_ratio - 1) // compress_ratio * compress_ratio,
159
+ hidden_states.shape[-1]), dtype=hidden_states.dtype).to(hidden_states.device)
160
+ passage_end_length = passage_lengths
161
+ passage_index = torch.arange(passage_hidden_states.shape[1], device=hidden_states.device).unsqueeze(0) # maybe exceed the max passage length
162
+ mask_passage_index = passage_index < passage_end_length[:, None]
163
+
164
+ raw_passage_end_length = query_lengths + passage_lengths
165
+ raw_passage_start_length = query_lengths
166
+ raw_passage_index = torch.arange(hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
167
+ raw_mask_passage_index_start = raw_passage_index >= raw_passage_start_length[:, None]
168
+ raw_mask_passage_index_end = raw_passage_index < raw_passage_end_length[:, None]
169
+ raw_mask_passage_index = raw_mask_passage_index_start & raw_mask_passage_index_end
170
+ passage_hidden_states[mask_passage_index] = hidden_states[raw_mask_passage_index]
171
+
172
+ passage_weights = torch.zeros((weights.shape[0],
173
+ (max_passage_length + compress_ratio - 1) // compress_ratio * compress_ratio)
174
+ , dtype=weights.dtype).to(hidden_states.device)
175
+ weights = torch.sum(weights, dim=1)
176
+ passage_weights[mask_passage_index] = weights[raw_mask_passage_index]
177
+ passage_weights = passage_weights.view(passage_weights.shape[0], -1, compress_ratio)
178
+ passage_weights = passage_weights / torch.sum(passage_weights, dim=-1
179
+ ).view(passage_weights.shape[0], -1, 1)
180
+ passage_weights = passage_weights.view(passage_weights.shape[0], -1)
181
+ # passage_weights = torch.where(passage_weights == torch.nan, 0, passage_weights)
182
+ passage_hidden_states = passage_hidden_states * passage_weights.unsqueeze(-1)
183
+ passage_hidden_states = passage_hidden_states.view(passage_hidden_states.shape[0], -1, compress_ratio,
184
+ passage_hidden_states.shape[-1])
185
+ passage_hidden_states = torch.sum(passage_hidden_states, dim=2)
186
+ passage_end_length = retain_passage_lengths
187
+ passage_index = torch.arange(passage_hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
188
+ mask_passage_index = passage_index < passage_end_length[:, None]
189
+ new_hidden_states[new_mask_passage_index] = passage_hidden_states[mask_passage_index]
190
+
191
+ return new_hidden_states, new_attention_mask
192
+
193
+ @add_start_docstrings(
194
+ "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
195
+ MISTRAL_START_DOCSTRING,
196
+ )
197
+ class CostWiseMistralModel(MistralPreTrainedModel):
198
+ """
199
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
200
+
201
+ Args:
202
+ config: MistralConfig
203
+ """
204
+
205
+ def __init__(self, config: CostWiseMistralConfig):
206
+ super().__init__(config)
207
+ self.padding_idx = config.pad_token_id
208
+ self.vocab_size = config.vocab_size
209
+
210
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
211
+ self.layers = nn.ModuleList(
212
+ [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
213
+ )
214
+ self._attn_implementation = config._attn_implementation
215
+ self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
216
+
217
+ self.gradient_checkpointing = False
218
+ # Initialize weights and apply final processing
219
+ self.post_init()
220
+
221
+ def get_input_embeddings(self):
222
+ return self.embed_tokens
223
+
224
+ def set_input_embeddings(self, value):
225
+ self.embed_tokens = value
226
+
227
+ @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
228
+ def forward(
229
+ self,
230
+ input_ids: torch.LongTensor = None,
231
+ attention_mask: Optional[torch.Tensor] = None,
232
+ position_ids: Optional[torch.LongTensor] = None,
233
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
234
+ inputs_embeds: Optional[torch.FloatTensor] = None,
235
+ use_cache: Optional[bool] = None,
236
+ output_attentions: Optional[bool] = None,
237
+ output_hidden_states: Optional[bool] = None,
238
+ return_dict: Optional[bool] = None,
239
+ compress_layer: Optional[int] = None,
240
+ compress_ratio: Optional[int] = None,
241
+ cutoff_layers: Optional[List[int]] = None,
242
+ query_lengths: Optional[int] = None,
243
+ prompt_lengths: Optional[int] = None,
244
+ ) -> Union[Tuple, CostWiseModelOutputWithPast]:
245
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
246
+
247
+ compress_ratio = None if compress_ratio == 1 else compress_ratio
248
+ if compress_layer is not None and compress_ratio is not None:
249
+ output_attentions = True
250
+
251
+ output_hidden_states = (
252
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
253
+ )
254
+
255
+ if self.config.layer_wise:
256
+ output_hidden_states = True
257
+
258
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
259
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
260
+
261
+ # retrieve input_ids and inputs_embeds
262
+ if input_ids is not None and inputs_embeds is not None:
263
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
264
+ elif input_ids is not None:
265
+ batch_size, seq_length = input_ids.shape
266
+ elif inputs_embeds is not None:
267
+ batch_size, seq_length, _ = inputs_embeds.shape
268
+ else:
269
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
270
+
271
+ if self.gradient_checkpointing and self.training:
272
+ if use_cache:
273
+ logger.warning_once(
274
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
275
+ )
276
+ use_cache = False
277
+
278
+ if compress_layer is not None and compress_ratio is not None:
279
+ logger.warning_once(
280
+ "`use_cache=True` is incompatible with reranker. Setting `use_cache=False`."
281
+ )
282
+ use_cache = False
283
+
284
+ past_key_values_length = 0
285
+
286
+ if use_cache:
287
+ use_legacy_cache = not isinstance(past_key_values, Cache)
288
+ if use_legacy_cache:
289
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
290
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
291
+
292
+ if position_ids is None:
293
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
294
+ position_ids = torch.arange(
295
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
296
+ )
297
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
298
+ else:
299
+ position_ids = position_ids.view(-1, seq_length).long()
300
+
301
+ if inputs_embeds is None:
302
+ inputs_embeds = self.embed_tokens(input_ids)
303
+
304
+ if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
305
+ is_padding_right = attention_mask[:, -1].sum().item() != batch_size
306
+ if is_padding_right:
307
+ raise ValueError(
308
+ "You are attempting to perform batched generation with padding_side='right'"
309
+ " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
310
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
311
+ )
312
+
313
+ if self._attn_implementation == "flash_attention_2":
314
+ # 2d mask is passed through the layers
315
+ input_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
316
+ elif self._attn_implementation == "sdpa" and not output_attentions:
317
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
318
+ # the manual implementation that requires a 4D causal mask in all cases.
319
+ input_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
320
+ attention_mask,
321
+ (batch_size, seq_length),
322
+ inputs_embeds,
323
+ past_key_values_length,
324
+ sliding_window=self.config.sliding_window,
325
+ )
326
+ else:
327
+ # 4d mask is passed through the layers
328
+ input_attention_mask = _prepare_4d_causal_attention_mask(
329
+ attention_mask,
330
+ (batch_size, seq_length),
331
+ inputs_embeds,
332
+ past_key_values_length,
333
+ sliding_window=self.config.sliding_window,
334
+ )
335
+
336
+ hidden_states = inputs_embeds
337
+
338
+ # decoder layers
339
+ all_hidden_states = () if output_hidden_states else None
340
+ all_attention_masks = ()
341
+ all_self_attns = () if output_attentions else None
342
+ next_decoder_cache = None
343
+
344
+ left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) and (
345
+ torch.sum(attention_mask) != attention_mask.shape[0] * attention_mask.shape[1])
346
+ query_lengths = [0] * hidden_states.shape[0] if query_lengths is None else query_lengths
347
+ prompt_lengths = [0] * hidden_states.shape[0] if prompt_lengths is None else prompt_lengths
348
+ if not isinstance(query_lengths, torch.Tensor):
349
+ query_lengths = torch.tensor(query_lengths, device=hidden_states.device)
350
+ if not isinstance(prompt_lengths, torch.Tensor):
351
+ prompt_lengths = torch.tensor(prompt_lengths, device=hidden_states.device)
352
+
353
+ if cutoff_layers is None:
354
+ max_layer = self.config.num_hidden_layers
355
+ cutoff_layers = [max_layer]
356
+ if isinstance(cutoff_layers, int):
357
+ max_layer = cutoff_layers
358
+ cutoff_layers = [cutoff_layers]
359
+ else:
360
+ max_layer = max(cutoff_layers)
361
+
362
+ for idx, decoder_layer in enumerate(self.layers):
363
+ if self.config.layer_wise:
364
+ if idx in cutoff_layers and output_hidden_states:
365
+ all_hidden_states += (self.norm(hidden_states),)
366
+ all_attention_masks += (attention_mask,)
367
+ if idx == max_layer:
368
+ break
369
+ elif output_hidden_states:
370
+ all_hidden_states += (hidden_states,)
371
+
372
+ if compress_layer is not None and compress_ratio is not None and idx in compress_layer and idx != 0:
373
+ # if all_self_attns is not None:
374
+ # # weights = all_self_attns[-1][:, :, -1, :]
375
+ # weights = all_self_attns
376
+ # else:
377
+ # weights = None
378
+
379
+ if left_padding:
380
+ raise ValueError('You must use right padding...')
381
+ hidden_states, attention_mask = token_compress(compress_ratio, hidden_states, attention_mask,
382
+ query_lengths, prompt_lengths, all_self_attns)
383
+ torch.cuda.empty_cache()
384
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
385
+ seq_length = hidden_states.shape[1]
386
+ position_ids = torch.arange(
387
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
388
+ )
389
+ position_ids = position_ids.unsqueeze(0)
390
+ if self._attn_implementation == "flash_attention_2":
391
+ # 2d mask is passed through the layers
392
+ input_attention_mask = attention_mask if (
393
+ attention_mask is not None and 0 in attention_mask) else None
394
+ elif self._attn_implementation == "sdpa" and not output_attentions:
395
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
396
+ # the manual implementation that requires a 4D causal mask in all cases.
397
+ input_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
398
+ attention_mask,
399
+ (batch_size, seq_length),
400
+ inputs_embeds,
401
+ past_key_values_length,
402
+ )
403
+ else:
404
+ # 4d mask is passed through the layers
405
+ input_attention_mask = _prepare_4d_causal_attention_mask(
406
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
407
+ )
408
+
409
+ if self.gradient_checkpointing and self.training:
410
+ layer_outputs = self._gradient_checkpointing_func(
411
+ decoder_layer.__call__,
412
+ hidden_states,
413
+ input_attention_mask,
414
+ position_ids,
415
+ past_key_values,
416
+ output_attentions,
417
+ use_cache,
418
+ )
419
+ else:
420
+ layer_outputs = decoder_layer(
421
+ hidden_states,
422
+ attention_mask=input_attention_mask,
423
+ position_ids=position_ids,
424
+ past_key_value=past_key_values,
425
+ output_attentions=output_attentions,
426
+ use_cache=use_cache,
427
+ )
428
+
429
+ hidden_states = layer_outputs[0]
430
+
431
+ if use_cache:
432
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
433
+
434
+ if output_attentions:
435
+ # all_self_attns += (layer_outputs[1],)
436
+ all_self_attns = layer_outputs[1][:, :, -1, :]
437
+
438
+ hidden_states = self.norm(hidden_states)
439
+
440
+ # add hidden states from the last decoder layer
441
+ if not self.config.layer_wise:
442
+ if output_hidden_states:
443
+ all_hidden_states += (hidden_states,)
444
+ all_attention_masks += (attention_mask,)
445
+ else:
446
+ if output_hidden_states and self.config.num_hidden_layers == max_layer:
447
+ all_hidden_states += (hidden_states,)
448
+ all_attention_masks += (attention_mask,)
449
+
450
+ next_cache = None
451
+ if use_cache:
452
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
453
+
454
+ torch.cuda.empty_cache()
455
+
456
+ if not return_dict:
457
+ return tuple(
458
+ v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_attention_masks] if
459
+ v is not None)
460
+ return CostWiseModelOutputWithPast(
461
+ last_hidden_state=hidden_states,
462
+ past_key_values=next_cache,
463
+ hidden_states=all_hidden_states,
464
+ attentions=all_self_attns,
465
+ attention_masks=all_attention_masks
466
+ )
467
+
468
+ class CostWiseHead(nn.Module):
469
+ """Head for sentence-level classification tasks."""
470
+
471
+ def __init__(self, input_size, output_size):
472
+ super().__init__()
473
+ self.linear_head = nn.Linear(input_size, output_size, bias=False)
474
+
475
+ def forward(self, **kwargs):
476
+ return self.linear_head(**kwargs)
477
+
478
class CostWiseMistralForCausalLM(MistralPreTrainedModel):
    """`CostWiseMistralModel` decoder with an output head on top.

    When `config.layer_wise` is false the head is a single vocabulary-sized
    linear layer and `forward` can compute a next-token cross-entropy loss.
    When `config.layer_wise` is true the head is a `ModuleList` holding one
    single-output `CostWiseHead` for every layer index in
    `range(config.start_layer, config.num_hidden_layers + 1, config.layer_sep)`,
    and `forward` returns one logits tensor per hidden state produced by the
    decoder (no loss is computed in that mode).
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = CostWiseMistralModel(config)
        self.vocab_size = config.vocab_size
        if config.layer_wise:
            # One scalar-output head per selectable exit layer.
            selectable_layers = range(config.start_layer, config.num_hidden_layers + 1, config.layer_sep)
            self.lm_head = nn.ModuleList(
                [CostWiseHead(config.hidden_size, 1) for _ in selectable_layers]
            )
        else:
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped decoder."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module of the wrapped decoder."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the output head (`nn.Linear`, or `nn.ModuleList` in layer-wise mode)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.model

    def _resolve_cutoff_layers(self, cutoff_layers):
        """Normalize `cutoff_layers` to a list of layer indices that have a head.

        `None` selects the final layer and a bare `int` is wrapped in a list.
        Indices outside
        `range(config.start_layer, config.num_hidden_layers + 1, config.layer_sep)`
        are dropped with a one-time warning.

        Raises:
            ValueError: If no requested layer remains after filtering.
        """
        if cutoff_layers is None:
            cutoff_layers = [self.config.num_hidden_layers]
        elif isinstance(cutoff_layers, int):
            cutoff_layers = [cutoff_layers]

        can_use_layers = range(self.config.start_layer, self.config.num_hidden_layers + 1, self.config.layer_sep)
        remove_layers = [i for i in cutoff_layers if i not in can_use_layers]
        if remove_layers:
            logger.warning_once(
                f"layers {remove_layers} are incompatible with the setting. They will be removed..."
            )

        cutoff_layers = [i for i in cutoff_layers if i in can_use_layers]
        if not cutoff_layers:
            raise ValueError(f"Your cutoff layers must in [{self.config.start_layer}, {self.config.num_hidden_layers}]")
        return cutoff_layers

    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        compress_layer: Optional[int] = None,
        compress_ratio: Optional[int] = None,
        cutoff_layers: Optional[Union[int, List[int]]] = None,
        query_lengths: Optional[int] = None,
        prompt_lengths: Optional[int] = None,
    ) -> Union[Tuple, CostWiseCausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
                Only used when `config.layer_wise` is false.
            compress_layer (*optional*):
                Forwarded unchanged to the underlying decoder.
            compress_ratio (*optional*):
                Forwarded to the underlying decoder; a value of `1` is treated the same as `None`.
            cutoff_layers (`int` or `List[int]`, *optional*):
                Only used when `config.layer_wise` is true. Defaults to `[config.num_hidden_layers]`. Entries that
                are not in `range(config.start_layer, config.num_hidden_layers + 1, config.layer_sep)` are dropped
                with a warning; a `ValueError` is raised if none remain.
            query_lengths (*optional*):
                Forwarded unchanged to the underlying decoder.
            prompt_lengths (*optional*):
                Forwarded unchanged to the underlying decoder.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MistralForCausalLM

        >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # A ratio of 1 is normalized to None before reaching the decoder.
        if compress_ratio == 1:
            compress_ratio = None

        if self.config.layer_wise:
            cutoff_layers = self._resolve_cutoff_layers(cutoff_layers)

        # Always ask the decoder for a ModelOutput: the code below reads
        # `outputs.hidden_states` / `outputs.past_key_values`, which raised
        # AttributeError when the caller passed return_dict=False and the
        # decoder handed back a plain tuple. The legacy tuple is rebuilt at
        # the end of this method instead.
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            compress_layer=compress_layer,
            compress_ratio=compress_ratio,
            query_lengths=query_lengths,
            prompt_lengths=prompt_lengths,
            cutoff_layers=cutoff_layers
        )

        loss = None
        if not self.config.layer_wise:
            hidden_states = outputs[0]
            logits = self.lm_head(hidden_states).float()
            if labels is not None:
                # Shift so that tokens < n predict n
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                # Flatten the tokens
                loss_fct = CrossEntropyLoss()
                shift_logits = shift_logits.view(-1, self.config.vocab_size)
                shift_labels = shift_labels.view(-1)
                # Enable model parallelism
                shift_labels = shift_labels.to(shift_logits.device)
                loss = loss_fct(shift_logits, shift_labels)
        else:
            # The i-th returned hidden state is scored by the i-th head; each
            # result is flattened to (batch, -1).
            logits = tuple(
                self.lm_head[i].linear_head(layer_hidden).float().reshape(layer_hidden.shape[0], -1)
                for i, layer_hidden in enumerate(outputs.hidden_states)
            )

        if not return_dict:
            # Slicing a ModelOutput goes through `to_tuple()`, which skips
            # None fields, mirroring how the decoder builds its own tuple.
            # NOTE(review): assumes CostWiseModelOutputWithPast declares its
            # fields in the order they are passed at construction — confirm.
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CostWiseCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            attention_masks=outputs[-1] if self.model.config.layer_wise else outputs[-1][-1]
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Build the keyword arguments for one generation step.

        Trims `input_ids` (and, when the cache has a maximum length,
        `attention_mask`) to the tokens not yet covered by `past_key_values`,
        derives `position_ids` from `attention_mask` when they are not
        supplied in `kwargs`, and returns a dict with either `inputs_embeds`
        (first step only) or `input_ids`, plus `position_ids`,
        `past_key_values`, `use_cache` and `attention_mask`.
        """
        # Omit tokens covered by past_key_values
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                # NOTE(review): `seen_tokens` and `get_max_length()` are
                # reported as deprecated in newer transformers releases —
                # confirm against the pinned version before upgrading.
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                # Legacy tuple cache: the length is read from dim 2 of the
                # first tensor of the first layer.
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
            # input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1]:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder every cached tensor along dim 0 according to `beam_idx`.

        Returns a tuple with one inner tuple per layer; `beam_idx` is moved to
        each tensor's device before indexing.
        """
        return tuple(
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
            for layer_past in past_key_values
        )
model-00001-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4754bece3b0299c14a2d361b81de3591d2f49ba0f68fbc0f5e1eac8549ed829f
3
+ size 4987196936
model-00002-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e867fc5dd3e522a17ea4c39009950f93720171409f12e18aeec2b4ae4cbeb85
3
+ size 4899116440
model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ba0ba3a4f6a6ae7997c79cd9103208ce2ddb83cc9a45b2555309f434330eff3
3
+ size 4999813120
model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350f054112733d572854a0c72425330f87bbf8329c238497ae7fc4248db7965e
3
+ size 4999813128
model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07b574afe44404b5ab28082f39d6943a9601a949b8e3391951b33d4c35b3b2a8
3
+ size 4832007496
model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46a074ec71141fa65ae3536fa24d47a777fc99dd7b40bd56e0a59c53f3f44fa3
3
+ size 3725204688
model.safetensors.index.json ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 28443115520
4
+ },
5
+ "weight_map": {
6
+ "lm_head.0.linear_head.weight": "model-00006-of-00006.safetensors",
7
+ "lm_head.1.linear_head.weight": "model-00006-of-00006.safetensors",
8
+ "lm_head.10.linear_head.weight": "model-00006-of-00006.safetensors",
9
+ "lm_head.11.linear_head.weight": "model-00006-of-00006.safetensors",
10
+ "lm_head.12.linear_head.weight": "model-00006-of-00006.safetensors",
11
+ "lm_head.13.linear_head.weight": "model-00006-of-00006.safetensors",
12
+ "lm_head.14.linear_head.weight": "model-00006-of-00006.safetensors",
13
+ "lm_head.15.linear_head.weight": "model-00006-of-00006.safetensors",
14
+ "lm_head.16.linear_head.weight": "model-00006-of-00006.safetensors",
15
+ "lm_head.17.linear_head.weight": "model-00006-of-00006.safetensors",
16
+ "lm_head.18.linear_head.weight": "model-00006-of-00006.safetensors",
17
+ "lm_head.19.linear_head.weight": "model-00006-of-00006.safetensors",
18
+ "lm_head.2.linear_head.weight": "model-00006-of-00006.safetensors",
19
+ "lm_head.20.linear_head.weight": "model-00006-of-00006.safetensors",
20
+ "lm_head.21.linear_head.weight": "model-00006-of-00006.safetensors",
21
+ "lm_head.22.linear_head.weight": "model-00006-of-00006.safetensors",
22
+ "lm_head.23.linear_head.weight": "model-00006-of-00006.safetensors",
23
+ "lm_head.24.linear_head.weight": "model-00006-of-00006.safetensors",
24
+ "lm_head.25.linear_head.weight": "model-00006-of-00006.safetensors",
25
+ "lm_head.26.linear_head.weight": "model-00006-of-00006.safetensors",
26
+ "lm_head.27.linear_head.weight": "model-00006-of-00006.safetensors",
27
+ "lm_head.28.linear_head.weight": "model-00006-of-00006.safetensors",
28
+ "lm_head.3.linear_head.weight": "model-00006-of-00006.safetensors",
29
+ "lm_head.4.linear_head.weight": "model-00006-of-00006.safetensors",
30
+ "lm_head.5.linear_head.weight": "model-00006-of-00006.safetensors",
31
+ "lm_head.6.linear_head.weight": "model-00006-of-00006.safetensors",
32
+ "lm_head.7.linear_head.weight": "model-00006-of-00006.safetensors",
33
+ "lm_head.8.linear_head.weight": "model-00006-of-00006.safetensors",
34
+ "lm_head.9.linear_head.weight": "model-00006-of-00006.safetensors",
35
+ "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
36
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
37
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
38
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
39
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
40
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
41
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
42
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
43
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
44
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
45
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
46
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
47
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
48
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
49
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
50
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
51
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
52
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
53
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
54
+ "model.layers.10.input_layernorm.weight": "model-00003-of-00006.safetensors",
55
+ "model.layers.10.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
56
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
57
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
58
+ "model.layers.10.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
59
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
60
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
61
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
62
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
63
+ "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
64
+ "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
65
+ "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
66
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
67
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
68
+ "model.layers.11.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
69
+ "model.layers.11.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
70
+ "model.layers.11.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
71
+ "model.layers.11.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
72
+ "model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors",
73
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
74
+ "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
75
+ "model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
76
+ "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
77
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
78
+ "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
79
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
80
+ "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
81
+ "model.layers.13.input_layernorm.weight": "model-00003-of-00006.safetensors",
82
+ "model.layers.13.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
83
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
84
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
85
+ "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
86
+ "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
87
+ "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
88
+ "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
89
+ "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
90
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00006.safetensors",
91
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
92
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
93
+ "model.layers.14.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
94
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
95
+ "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
96
+ "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
97
+ "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
98
+ "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
99
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
100
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
101
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
102
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
103
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
104
+ "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
105
+ "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
106
+ "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
107
+ "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
108
+ "model.layers.16.input_layernorm.weight": "model-00004-of-00006.safetensors",
109
+ "model.layers.16.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
110
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
111
+ "model.layers.16.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
112
+ "model.layers.16.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
113
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
114
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
115
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
124
+ "model.layers.17.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
125
+ "model.layers.17.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
126
+ "model.layers.18.input_layernorm.weight": "model-00004-of-00006.safetensors",
127
+ "model.layers.18.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
128
+ "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
129
+ "model.layers.18.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
130
+ "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
131
+ "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
132
+ "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
133
+ "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
134
+ "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
135
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00006.safetensors",
136
+ "model.layers.19.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
137
+ "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
138
+ "model.layers.19.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
139
+ "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
140
+ "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
141
+ "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
142
+ "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
143
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
144
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
145
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
146
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
147
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
148
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
149
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
150
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
151
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
152
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
153
+ "model.layers.20.input_layernorm.weight": "model-00004-of-00006.safetensors",
154
+ "model.layers.20.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
155
+ "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
156
+ "model.layers.20.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
157
+ "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
167
+ "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
168
+ "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
169
+ "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
170
+ "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
171
+ "model.layers.22.input_layernorm.weight": "model-00005-of-00006.safetensors",
172
+ "model.layers.22.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
173
+ "model.layers.22.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
174
+ "model.layers.22.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
175
+ "model.layers.22.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
176
+ "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
177
+ "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
178
+ "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
179
+ "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
180
+ "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
181
+ "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
182
+ "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
183
+ "model.layers.23.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
184
+ "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
185
+ "model.layers.23.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
186
+ "model.layers.23.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
187
+ "model.layers.23.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
188
+ "model.layers.23.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
189
+ "model.layers.24.input_layernorm.weight": "model-00005-of-00006.safetensors",
190
+ "model.layers.24.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
191
+ "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
192
+ "model.layers.24.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
193
+ "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
194
+ "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
195
+ "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
196
+ "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
197
+ "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
198
+ "model.layers.25.input_layernorm.weight": "model-00005-of-00006.safetensors",
199
+ "model.layers.25.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
200
+ "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
201
+ "model.layers.25.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
202
+ "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
203
+ "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
204
+ "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
205
+ "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
206
+ "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
207
+ "model.layers.26.input_layernorm.weight": "model-00005-of-00006.safetensors",
208
+ "model.layers.26.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
209
+ "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
210
+ "model.layers.26.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
211
+ "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
212
+ "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
213
+ "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
214
+ "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
215
+ "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
216
+ "model.layers.27.input_layernorm.weight": "model-00006-of-00006.safetensors",
217
+ "model.layers.27.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
218
+ "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
219
+ "model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
220
+ "model.layers.27.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
221
+ "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
222
+ "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
223
+ "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
224
+ "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
225
+ "model.layers.28.input_layernorm.weight": "model-00006-of-00006.safetensors",
226
+ "model.layers.28.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
227
+ "model.layers.28.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
228
+ "model.layers.28.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
229
+ "model.layers.28.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
230
+ "model.layers.28.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
231
+ "model.layers.28.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
232
+ "model.layers.28.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
233
+ "model.layers.28.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
234
+ "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
235
+ "model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
236
+ "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
237
+ "model.layers.29.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
238
+ "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
239
+ "model.layers.29.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
240
+ "model.layers.29.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
241
+ "model.layers.29.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
242
+ "model.layers.29.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
243
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
244
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
245
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
246
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
247
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
248
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
249
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
250
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
251
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
252
+ "model.layers.30.input_layernorm.weight": "model-00006-of-00006.safetensors",
253
+ "model.layers.30.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
254
+ "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
255
+ "model.layers.30.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
256
+ "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
257
+ "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
258
+ "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
259
+ "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
260
+ "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
261
+ "model.layers.31.input_layernorm.weight": "model-00006-of-00006.safetensors",
262
+ "model.layers.31.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
263
+ "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
264
+ "model.layers.31.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
265
+ "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
266
+ "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
267
+ "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
268
+ "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
269
+ "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
270
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
271
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
272
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
273
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
274
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
275
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
276
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
277
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
278
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
279
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
280
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
281
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
282
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
283
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
284
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
285
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
286
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
287
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
288
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
289
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
290
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
291
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
292
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
293
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
294
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
295
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
296
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
297
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
298
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
299
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
300
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
301
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
302
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
303
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
304
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
305
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
306
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors",
307
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
308
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
309
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
310
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
311
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
312
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
313
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
314
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
315
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
316
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
317
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
318
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
319
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
320
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
321
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
322
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
323
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
324
+ "model.norm.weight": "model-00006-of-00006.safetensors"
325
+ }
326
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<s>",
33
+ "clean_up_tokenization_spaces": false,
34
+ "eos_token": "</s>",
35
+ "legacy": true,
36
+ "model_max_length": 1000000000000000019884624838656,
37
+ "pad_token": "<unk>",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }