yifan0sun committed
Commit 9b48d92 · verified · 1 parent: 6362f90

Update ROBERTAmodel.py

Files changed (1):
  1. ROBERTAmodel.py (+54 -51)
ROBERTAmodel.py CHANGED
@@ -5,9 +5,8 @@ from models import TransformerVisualizer
from transformers import (
    RobertaForMaskedLM, RobertaForSequenceClassification
)
-import os
-import time
-
+import os,time
+import torch.autograd.functional as Fgrad

CACHE_DIR = "/data/hf_cache"

@@ -151,7 +150,6 @@ class RoBERTaVisualizer(TransformerVisualizer):
    def get_all_grad_attn_matrix(self, task, sentence, hypothesis='', maskID = None):
        print(task, sentence, hypothesis)
        print('Tokenize')
-        start = time.time()
        if task == 'mnli':
            inputs = self.tokenizer(sentence, hypothesis, return_tensors='pt', padding=False, truncation=True)
        elif task == 'mlm':
@@ -163,91 +161,96 @@ class RoBERTaVisualizer(TransformerVisualizer):
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        print(tokens)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        print(1,time.time()-start)
-        start = time.time()

        print('Input embeddings with grad')
        embedding_layer = self.model.roberta.embeddings.word_embeddings
        inputs_embeds = embedding_layer(inputs["input_ids"])
        inputs_embeds.requires_grad_()
-
-        print(2,time.time()-start)
-        start = time.time()
+
        print('Forward pass')
        outputs = self.model.roberta(
            inputs_embeds=inputs_embeds,
            attention_mask=inputs["attention_mask"],
            output_attentions=True
        )
+
+
+
+
+
        attentions = outputs.attentions # list of [1, heads, seq, seq]
-
-        print(3,time.time()-start)
-        start = time.time()
+
        print('Average attentions per layer')
        mean_attns = [a.squeeze(0).mean(dim=0).detach().cpu() for a in attentions]
-        print(4,time.time()-start)
-        startloop = time.time()
-        start = time.time()

        attn_matrices_all = []
        grad_matrices_all = []
        for target_layer in range(len(attentions)):
-            print(5,target_layer, len(attentions), time.time()-start)
-            start = time.time()
-            grad_matrix, attn_matrix = self.get_grad_attn_matrix(inputs_embeds, attentions, mean_attns, target_layer)
-            print(6,target_layer, len(attentions), time.time()-start)
-            start = time.time()
-            grad_matrices_all.append(grad_matrix.tolist())
+            #grad_matrix, attn_matrix = self.get_grad_attn_matrix(inputs_embeds, attentions, mean_attns, target_layer)
+
+            attn_matrix = mean_attns[target_layer]
+            seq_len = attn_matrix.shape[0]
+            attn_matrix = attn_matrix[:seq_len, :seq_len]
            attn_matrices_all.append(attn_matrix.tolist())
-            print(7,target_layer, len(attentions), time.time()-start)
+
+
+
            start = time.time()
+            def scalar_outputs(inputs_embeds):
+
+                outputs = self.model.roberta(
+                    inputs_embeds=inputs_embeds,
+                    attention_mask=inputs["attention_mask"],
+                    output_attentions=True
+                )
+                attentions = outputs.attentions
+                return attentions[target_layer].mean(dim=0).mean(dim=0).sum(dim=0)
+
+            jac = torch.autograd.functional.jacobian(scalar_outputs, inputs_embeds).norm(dim=-1).squeeze(1)
+
+            grad_matrices_all.append(jac.tolist())
+            print(1,time.time()-start)
+
+            start = time.time()
+            grad_norms_list = []
+
+            for k in range(seq_len):
+                scalar = attentions[target_layer].mean(dim=0).mean(dim=0)
+                scalar = scalar[:, k].sum()
+
+                grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
+
+                grad_norms = grad.norm(dim=1)
+                grad_norms_list.append(grad_norms.unsqueeze(1))
+            print(2,time.time()-start)

-        print(8,time.time()-startloop)
        return grad_matrices_all, attn_matrices_all

    def get_grad_attn_matrix(self,inputs_embeds, attentions, mean_attns, target_layer):

-        start = time.time()
        attn_matrix = mean_attns[target_layer]
        seq_len = attn_matrix.shape[0]
-
-        attn_matrix = torch.round(attn_matrix.float() * 100) / 100
-        attn_matrix = attn_matrix.to(torch.float16)
-
        attn_layer = attentions[target_layer].squeeze(0).mean(dim=0) # [seq, seq]
-        print(9,time.time()-start)
-        start = time.time()
-        #print('Computing grad norms')
+        """
+        print('Computing grad norms')
        grad_norms_list = []
+
        for k in range(seq_len):
            scalar = attn_layer[:, k].sum()
-            grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
-            grad_norms = grad.norm(dim=1)
-

-            grad_norms = torch.round(grad_norms.unsqueeze(1).float() * 100) / 100
-            grad_norms = grad_norms.to(torch.float16)
+            grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)

+            grad_norms = grad.norm(dim=1)
+            grad_norms_list.append(grad_norms.unsqueeze(1))

-            grad_norms_list.append(grad_norms)
-
-            print(10,time.time()-start)
-            start = time.time()
        grad_matrix = torch.cat(grad_norms_list, dim=1)
-        grad_matrix = grad_matrix[:seq_len, :seq_len]
-        attn_matrix = attn_matrix[:seq_len, :seq_len]
-
-        print(11,time.time()-start)
-        start = time.time()

-        attn_matrix = torch.round(attn_matrix.float() * 100) / 100
-        attn_matrix = attn_matrix.to(torch.float16)

-        grad_matrix = torch.round(grad_matrix.float() * 100) / 100
-        grad_matrix = grad_matrix.to(torch.float16)
-        print(12,time.time()-start)

-

+        grad_matrix = grad_matrix[:seq_len, :seq_len]
+        """
+        attn_matrix = attn_matrix[:seq_len, :seq_len]
+        grad_matrix = attn_matrix

        return grad_matrix, attn_matrix
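The substantive change sits in `get_all_grad_attn_matrix`: the per-layer attention matrix now comes straight from `mean_attns`, and the gradient matrix is built with a single `torch.autograd.functional.jacobian` call over a `scalar_outputs` closure that re-runs the forward pass and returns the column sums of that layer's head-averaged attention, i.e. the total attention each key position receives. The sketch below shows the same pattern outside the class; it is a minimal illustration assuming a stock `roberta-base` checkpoint and a toy sentence, not the visualizer's own `self.model`/`self.tokenizer`.

```python
# Minimal sketch of the jacobian-based gradient matrix (assumed setup: plain
# roberta-base, not the visualizer class from this repo).
import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
inputs_embeds = model.embeddings.word_embeddings(inputs["input_ids"]).detach()
target_layer = 0

def scalar_outputs(inputs_embeds):
    # Re-run the forward pass inside the closure so jacobian() can trace it.
    outputs = model(
        inputs_embeds=inputs_embeds,
        attention_mask=inputs["attention_mask"],
        output_attentions=True,
    )
    attn = outputs.attentions[target_layer]  # [1, heads, seq, seq]
    # Average over batch and heads, then sum over the query axis:
    # one scalar per key position.
    return attn.mean(dim=0).mean(dim=0).sum(dim=0)

# jacobian shape: [seq, 1, seq, hidden]; take the norm over the hidden
# dimension and drop the batch dimension to get a [seq, seq] matrix.
jac = torch.autograd.functional.jacobian(scalar_outputs, inputs_embeds)
grad_matrix = jac.norm(dim=-1).squeeze(1)
print(grad_matrix.shape)  # torch.Size([seq, seq])
```

With the default `vectorize=False`, `jacobian` still differentiates each of the `seq` outputs separately, so the gain is mainly simpler bookkeeping; the timing `print` calls the commit leaves in place appear to compare this path against the explicit per-column loop it keeps alongside.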
 
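For reference, the older per-key-position `torch.autograd.grad` loop is demoted rather than deleted: it survives as the timed comparison at the end of the layer loop, while `get_grad_attn_matrix` wraps its old body in a triple-quoted string and now returns the sliced attention matrix for both outputs. A self-contained sketch of that pattern, under the same assumed `roberta-base` setup as above:

```python
# Sketch of the per-key-position autograd.grad loop (assumed roberta-base setup).
import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
# Detach so the embeddings are a leaf tensor we can differentiate with respect to.
inputs_embeds = model.embeddings.word_embeddings(inputs["input_ids"]).detach().requires_grad_()
target_layer = 0

outputs = model(
    inputs_embeds=inputs_embeds,
    attention_mask=inputs["attention_mask"],
    output_attentions=True,
)
attn_layer = outputs.attentions[target_layer].squeeze(0).mean(dim=0)  # [seq, seq]
seq_len = attn_layer.shape[0]

grad_norms_list = []
for k in range(seq_len):
    # One backward pass per key position k: gradient of the total attention
    # received by token k with respect to every input embedding.
    scalar = attn_layer[:, k].sum()
    grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
    grad_norms_list.append(grad.norm(dim=1).unsqueeze(1))

grad_matrix = torch.cat(grad_norms_list, dim=1)  # [seq, seq], one column per key position
print(grad_matrix.shape)
```

Either way the result is a [seq, seq] matrix of gradient norms; note that the commit also drops the `torch.round`/float16 casts the old helper applied to both matrices.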