Paramecium880 committed
Commit e601f99 · verified · 1 Parent(s): 33d8679

Create ling_gptq.patch

Files changed (1):
  1. ling_gptq.patch +351 -0
ling_gptq.patch ADDED
@@ -0,0 +1,351 @@
+ --- vllm/model_executor/models/deepseek.py 2025-04-03 11:17:01.787109116 +0800
+ +++ ling_vllm_patch_a.py 2025-04-02 20:53:47.649000000 +0800
+ @@ -21,7 +21,7 @@
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Inference-only Deepseek model."""
+ -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+ +from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ import torch
+ from torch import nn
+ @@ -29,18 +29,19 @@
+
+ from vllm.attention import Attention, AttentionMetadata
+ from vllm.config import CacheConfig
+ -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+ +from vllm.distributed import (get_tensor_model_parallel_rank,
+ get_tensor_model_parallel_world_size,
+ tensor_model_parallel_all_reduce)
+ from vllm.model_executor.layers.activation import SiluAndMul
+ -from vllm.model_executor.layers.fused_moe import fused_moe
+ +from vllm.model_executor.layers.fused_moe import FusedMoE
+ from vllm.model_executor.layers.layernorm import RMSNorm
+ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+ QKVParallelLinear,
+ ReplicatedLinear,
+ RowParallelLinear)
+ from vllm.model_executor.layers.logits_processor import LogitsProcessor
+ -from vllm.model_executor.layers.quantization import QuantizationConfig
+ +from vllm.model_executor.layers.quantization.base_config import (
+ + QuantizationConfig)
+ from vllm.model_executor.layers.rotary_embedding import get_rope
+ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
+ @@ -49,10 +50,6 @@
+ from vllm.model_executor.sampling_metadata import SamplingMetadata
+ from vllm.sequence import IntermediateTensors
+
+ -from .interfaces import SupportsPP
+ -from .utils import (is_pp_missing_parameter,
+ - make_empty_intermediate_tensors_factory, make_layers)
+ -
+
+ class DeepseekMLP(nn.Module):
+
+ @@ -91,6 +88,7 @@
+ def __init__(
+ self,
+ config: PretrainedConfig,
+ + layer_idx: int,
+ quant_config: Optional[QuantizationConfig] = None,
+ ):
+ super().__init__()
+ @@ -104,15 +102,17 @@
+ f"Tensor parallel size {self.tp_size} is greater than "
+ f"the number of experts {self.n_routed_experts}.")
+
+ - self.experts = nn.ModuleList([
+ - DeepseekMLP(hidden_size=config.hidden_size,
+ - intermediate_size=config.moe_intermediate_size,
+ - hidden_act=config.hidden_act,
+ - quant_config=quant_config,
+ - reduce_results=False)
+ - for idx in range(self.n_routed_experts)
+ - ])
+ - self.pack_params()
+ + self.experts = FusedMoE(
+ + num_experts=config.n_routed_experts,
+ + top_k=config.num_experts_per_tok,
+ + hidden_size=config.hidden_size,
+ + intermediate_size=config.moe_intermediate_size,
+ + reduce_results=False,
+ + renormalize=config.norm_topk_prob,
+ + quant_config=quant_config,
+ + use_grouped_topk=False,
+ + prefix=f"model.layers.{layer_idx}.mlp.experts"
+ + )
+
+ self.gate = ReplicatedLinear(config.hidden_size,
+ self.n_routed_experts,
+ @@ -130,25 +130,6 @@
+ reduce_results=False,
+ )
+
+ - def pack_params(self):
+ - w1 = []
+ - w2 = []
+ - for expert in self.experts:
+ - w1.append(expert.gate_up_proj.weight)
+ - w2.append(expert.down_proj.weight)
+ - self.w1 = torch._utils._flatten_dense_tensors(w1)
+ - w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+ - for data, param in zip(w1s, w1):
+ - param.data = data
+ - self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+ -
+ - self.w2 = torch._utils._flatten_dense_tensors(w2)
+ - w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+ - for data, param in zip(w2s, w2):
+ - param.data = data
+ -
+ - self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+ -
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ num_tokens, hidden_dim = hidden_states.shape
+ hidden_states = hidden_states.view(-1, hidden_dim)
+ @@ -156,18 +137,14 @@
+ shared_output = self.shared_experts(hidden_states)
+ # router_logits: (num_tokens, n_experts)
+ router_logits, _ = self.gate(hidden_states)
+ - final_hidden_states = fused_moe(hidden_states,
+ - self.w1,
+ - self.w2,
+ - router_logits,
+ - self.top_k,
+ - renormalize=self.config.norm_topk_prob,
+ - inplace=True)
+ + final_hidden_states = self.experts(hidden_states=hidden_states,
+ + router_logits=router_logits)
+
+ - if self.config.n_shared_experts is not None:
+ + if shared_output is not None:
+ final_hidden_states = final_hidden_states + shared_output
+ - final_hidden_states = tensor_model_parallel_all_reduce(
+ - final_hidden_states)
+ + if self.tp_size > 1:
+ + final_hidden_states = tensor_model_parallel_all_reduce(
+ + final_hidden_states)
+
+ return final_hidden_states.view(num_tokens, hidden_dim)
+
+ @@ -179,6 +156,7 @@
+ hidden_size: int,
+ num_heads: int,
+ num_kv_heads: int,
+ + head_dim: int,
+ rope_theta: float = 10000,
+ rope_scaling: Optional[Dict[str, Any]] = None,
+ max_position_embeddings: int = 8192,
+ @@ -201,7 +179,8 @@
+ # the KV heads across multiple tensor parallel GPUs.
+ assert tp_size % self.total_num_kv_heads == 0
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+ - self.head_dim = hidden_size // self.total_num_heads
+ + # self.head_dim = hidden_size // self.total_num_heads
+ + self.head_dim = hidden_size // self.total_num_heads if head_dim is None else head_dim
+ self.q_size = self.num_heads * self.head_dim
+ self.kv_size = self.num_kv_heads * self.head_dim
+ self.scaling = self.head_dim**-0.5
+ @@ -268,10 +247,12 @@
+ rope_scaling = getattr(config, "rope_scaling", None)
+ max_position_embeddings = getattr(config, "max_position_embeddings",
+ 8192)
+ + head_dim = getattr(config, "head_dim", None)
+ self.self_attn = DeepseekAttention(
+ hidden_size=self.hidden_size,
+ num_heads=config.num_attention_heads,
+ num_kv_heads=config.num_key_value_heads,
+ + head_dim=head_dim,
+ rope_theta=rope_theta,
+ rope_scaling=rope_scaling,
+ max_position_embeddings=max_position_embeddings,
+ @@ -281,7 +262,7 @@
+ if (config.n_routed_experts is not None
+ and layer_idx >= config.first_k_dense_replace
+ and layer_idx % config.moe_layer_freq == 0):
+ - self.mlp = DeepseekMoE(config=config, quant_config=quant_config)
+ + self.mlp = DeepseekMoE(config=config, quant_config=quant_config, layer_idx=layer_idx)
+ else:
+ self.mlp = DeepseekMLP(
+ hidden_size=config.hidden_size,
+ @@ -332,7 +313,6 @@
+ config: PretrainedConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None,
+ - prefix: str = "",
+ ) -> None:
+ super().__init__()
+ self.padding_idx = config.pad_token_id
+ @@ -342,17 +322,14 @@
+ config.vocab_size,
+ config.hidden_size,
+ )
+ - self.start_layer, self.end_layer, self.layers = make_layers(
+ - config.num_hidden_layers,
+ - lambda prefix: DeepseekDecoderLayer(config,
+ - int(prefix.split(".")[-1]),
+ - cache_config,
+ - quant_config=quant_config),
+ - prefix=f"{prefix}.layers")
+ + self.layers = nn.ModuleList([
+ + DeepseekDecoderLayer(config,
+ + layer_idx,
+ + cache_config,
+ + quant_config=quant_config)
+ + for layer_idx in range(config.num_hidden_layers)
+ + ])
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ - self.make_empty_intermediate_tensors = (
+ - make_empty_intermediate_tensors_factory(
+ - ["hidden_states", "residual"], config.hidden_size))
+
+ def forward(
+ self,
+ @@ -360,29 +337,19 @@
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ - intermediate_tensors: Optional[IntermediateTensors],
+ - ) -> Union[torch.Tensor, IntermediateTensors]:
+ - if get_pp_group().is_first_rank:
+ - hidden_states = self.embed_tokens(input_ids)
+ - residual = None
+ - else:
+ - hidden_states = intermediate_tensors["hidden_states"]
+ - residual = intermediate_tensors["residual"]
+ - for i in range(self.start_layer, self.end_layer):
+ + ) -> torch.Tensor:
+ + hidden_states = self.embed_tokens(input_ids)
+ + residual = None
+ + for i in range(len(self.layers)):
+ layer = self.layers[i]
+ hidden_states, residual = layer(positions, hidden_states,
+ - kv_caches[i - self.start_layer],
+ - attn_metadata, residual)
+ - if not get_pp_group().is_last_rank:
+ - return IntermediateTensors({
+ - "hidden_states": hidden_states,
+ - "residual": residual
+ - })
+ + kv_caches[i], attn_metadata,
+ + residual)
+ hidden_states, _ = self.norm(hidden_states, residual)
+ return hidden_states
+
+
+ -class DeepseekForCausalLM(nn.Module, SupportsPP):
+ +class DeepseekForCausalLM(nn.Module):
+
+ def __init__(
+ self,
+ @@ -401,8 +368,6 @@
+ self.lm_head.weight = self.model.embed_tokens.weight
+ self.logits_processor = LogitsProcessor(config.vocab_size)
+ self.sampler = Sampler()
+ - self.make_empty_intermediate_tensors = (
+ - self.model.make_empty_intermediate_tensors)
+
+ def forward(
+ self,
+ @@ -411,9 +376,9 @@
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
+ - ) -> Union[torch.Tensor, IntermediateTensors]:
+ + ) -> torch.Tensor:
+ hidden_states = self.model(input_ids, positions, kv_caches,
+ - attn_metadata, intermediate_tensors)
+ + attn_metadata)
+ return hidden_states
+
+ def compute_logits(
+ @@ -443,6 +408,15 @@
+ ("gate_up_proj", "up_proj", 1),
+ ]
+
+ + # Params for weights, fp8 weight scales, fp8 activation scales
+ + # (param_name, weight_name, expert_id, shard_id)
+ + expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ + ckpt_gate_proj_name="gate_proj",
+ + ckpt_down_proj_name="down_proj",
+ + ckpt_up_proj_name="up_proj",
+ + num_experts=self.config.n_routed_experts,
+ + )
+ +
+ params_dict = dict(self.named_parameters())
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ @@ -450,31 +424,41 @@
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ + if ("mlp.experts." in name) and name not in params_dict:
+ + continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ - # Skip experts that are not assigned to this worker.
+ - if (("mlp.experts." in name or "mlp.shared_experts." in name)
+ - and name not in params_dict):
+ - continue
+ - if is_pp_missing_parameter(name, self):
+ - continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ - # Skip loading extra bias for GPTQ models.
+ - if name.endswith(".bias") and name not in params_dict:
+ - continue
+ - # Skip experts that are not assigned to this worker.
+ - if (("mlp.experts." in name or "mlp.shared_experts." in name)
+ - and name not in params_dict):
+ - continue
+ - if is_pp_missing_parameter(name, self):
+ - continue
+ - param = params_dict[name]
+ - weight_loader = getattr(param, "weight_loader",
+ - default_weight_loader)
+ - weight_loader(param, loaded_weight)
+ + for mapping in expert_params_mapping:
+ + param_name, weight_name, expert_id, shard_id = mapping
+ + if weight_name not in name:
+ + continue
+ + name = name.replace(weight_name, param_name)
+ + param = params_dict[name]
+ + weight_loader = param.weight_loader
+ + weight_loader(
+ + param,
+ + loaded_weight,
+ + name,
+ + shard_id=shard_id,
+ + expert_id=expert_id,
+ + )
+ + break
+ + else:
+ + # Skip loading extra bias for GPTQ models.
+ + if name.endswith(".bias") and name not in params_dict:
+ + continue
+ + # Skip experts that are not assigned to this worker.
+ + if ("mlp.experts." in name or "mlp.shared_experts."
+ + in name) and name not in params_dict:
+ + continue
+ + param = params_dict[name]
+ + weight_loader = getattr(param, "weight_loader",
+ + default_weight_loader)
+ + weight_loader(param, loaded_weight)
+
+ --- vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py 2025-04-03 11:17:01.781109069 +0800
+ +++ ling_vllm_patch_b.py 2025-04-02 20:54:38.521781433 +0800
+ @@ -245,7 +245,7 @@
+ config = self.quant_config.target_scheme_map["Linear"].get("weights")
+ self.num_bits = config.num_bits
+ self.packed_factor = 32 // config.num_bits
+ - self.strategy = config.strategy.value
+ + self.strategy = config.strategy
+ self.group_size = config.group_size
+ assert config.symmetric, (
+ "Only symmetric quantization is supported for MoE")