Update ROBERTAmodel.py
ROBERTAmodel.py +54 -51
ROBERTAmodel.py
CHANGED
@@ -5,9 +5,8 @@ from models import TransformerVisualizer
 from transformers import (
     RobertaForMaskedLM, RobertaForSequenceClassification
 )
-import os
-import
-
+import os,time
+import torch.autograd.functional as Fgrad
 
 CACHE_DIR = "/data/hf_cache"
 
@@ -151,7 +150,6 @@ class RoBERTaVisualizer(TransformerVisualizer):
     def get_all_grad_attn_matrix(self, task, sentence, hypothesis='', maskID = None):
         print(task, sentence, hypothesis)
         print('Tokenize')
-        start = time.time()
         if task == 'mnli':
             inputs = self.tokenizer(sentence, hypothesis, return_tensors='pt', padding=False, truncation=True)
         elif task == 'mlm':
@@ -163,91 +161,96 @@ class RoBERTaVisualizer(TransformerVisualizer):
         tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
         print(tokens)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
-        print(1,time.time()-start)
-        start = time.time()
 
         print('Input embeddings with grad')
         embedding_layer = self.model.roberta.embeddings.word_embeddings
         inputs_embeds = embedding_layer(inputs["input_ids"])
         inputs_embeds.requires_grad_()
-
-        print(2,time.time()-start)
-        start = time.time()
+
         print('Forward pass')
         outputs = self.model.roberta(
             inputs_embeds=inputs_embeds,
             attention_mask=inputs["attention_mask"],
             output_attentions=True
         )
+
+
+
+
+
         attentions = outputs.attentions # list of [1, heads, seq, seq]
-
-        print(3,time.time()-start)
-        start = time.time()
+
         print('Average attentions per layer')
         mean_attns = [a.squeeze(0).mean(dim=0).detach().cpu() for a in attentions]
-        print(4,time.time()-start)
-        startloop = time.time()
-        start = time.time()
 
         attn_matrices_all = []
         grad_matrices_all = []
         for target_layer in range(len(attentions)):
-
-
-
-
-
-            grad_matrices_all.append(grad_matrix.tolist())
+            #grad_matrix, attn_matrix = self.get_grad_attn_matrix(inputs_embeds, attentions, mean_attns, target_layer)
+
+            attn_matrix = mean_attns[target_layer]
+            seq_len = attn_matrix.shape[0]
+            attn_matrix = attn_matrix[:seq_len, :seq_len]
             attn_matrices_all.append(attn_matrix.tolist())
-
+
+
+
             start = time.time()
+            def scalar_outputs(inputs_embeds):
+
+                outputs = self.model.roberta(
+                    inputs_embeds=inputs_embeds,
+                    attention_mask=inputs["attention_mask"],
+                    output_attentions=True
+                )
+                attentions = outputs.attentions
+                return attentions[target_layer].mean(dim=0).mean(dim=0).sum(dim=0)
+
+            jac = torch.autograd.functional.jacobian(scalar_outputs, inputs_embeds).norm(dim=-1).squeeze(1)
+
+            grad_matrices_all.append(jac.tolist())
+            print(1,time.time()-start)
+
+            start = time.time()
+            grad_norms_list = []
+
+            for k in range(seq_len):
+                scalar = attentions[target_layer].mean(dim=0).mean(dim=0)
+                scalar = scalar[:, k].sum()
+
+                grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
+
+                grad_norms = grad.norm(dim=1)
+                grad_norms_list.append(grad_norms.unsqueeze(1))
+            print(2,time.time()-start)
 
-        print(8,time.time()-startloop)
         return grad_matrices_all, attn_matrices_all
 
     def get_grad_attn_matrix(self,inputs_embeds, attentions, mean_attns, target_layer):
 
-        start = time.time()
         attn_matrix = mean_attns[target_layer]
         seq_len = attn_matrix.shape[0]
-
-        attn_matrix = torch.round(attn_matrix.float() * 100) / 100
-        attn_matrix = attn_matrix.to(torch.float16)
-
         attn_layer = attentions[target_layer].squeeze(0).mean(dim=0) # [seq, seq]
-
-
-        #print('Computing grad norms')
+        """
+        print('Computing grad norms')
         grad_norms_list = []
+
         for k in range(seq_len):
             scalar = attn_layer[:, k].sum()
-            grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
-            grad_norms = grad.norm(dim=1)
-
 
-
-            grad_norms = grad_norms.to(torch.float16)
+            grad = torch.autograd.grad(scalar, inputs_embeds, retain_graph=True)[0].squeeze(0)
 
+            grad_norms = grad.norm(dim=1)
+            grad_norms_list.append(grad_norms.unsqueeze(1))
 
-            grad_norms_list.append(grad_norms)
-
-            print(10,time.time()-start)
-            start = time.time()
         grad_matrix = torch.cat(grad_norms_list, dim=1)
-        grad_matrix = grad_matrix[:seq_len, :seq_len]
-        attn_matrix = attn_matrix[:seq_len, :seq_len]
-
-        print(11,time.time()-start)
-        start = time.time()
 
-        attn_matrix = torch.round(attn_matrix.float() * 100) / 100
-        attn_matrix = attn_matrix.to(torch.float16)
 
-        grad_matrix = torch.round(grad_matrix.float() * 100) / 100
-        grad_matrix = grad_matrix.to(torch.float16)
-        print(12,time.time()-start)
 
-
 
+        grad_matrix = grad_matrix[:seq_len, :seq_len]
+        """
+        attn_matrix = attn_matrix[:seq_len, :seq_len]
+        grad_matrix = attn_matrix
 
         return grad_matrix, attn_matrix
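
For reference, here is a standalone sketch (not part of the commit) of the two gradient-attribution routes the new loop body uses: a single torch.autograd.functional.jacobian call over a helper that returns the column sums of one layer's head-averaged attention map, and the per-column torch.autograd.grad loop. The checkpoint name ("roberta-base"), the example sentence, and target_layer are placeholder assumptions; the Space itself runs this inside RoBERTaVisualizer with its own cached model.

# Standalone sketch, not the Space's code: both gradient routes on a bare RobertaModel.
# "roberta-base", the sentence, and target_layer are placeholder assumptions.
import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")
model.eval()

inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
inputs_embeds = model.embeddings.word_embeddings(inputs["input_ids"])  # [1, seq, hidden]
inputs_embeds.requires_grad_()

target_layer = 0

def scalar_outputs(embeds):
    # Forward pass on the candidate embeddings; return the column sums of the
    # head-averaged attention map for one layer, shape [seq].
    out = model(inputs_embeds=embeds,
                attention_mask=inputs["attention_mask"],
                output_attentions=True)
    attn = out.attentions[target_layer]             # [1, heads, seq, seq]
    return attn.mean(dim=0).mean(dim=0).sum(dim=0)  # [seq]

# Route 1: one jacobian call. The result has shape [seq, 1, seq, hidden]; the norm
# over the hidden dimension plus squeeze gives a [seq, seq] gradient-norm matrix.
jac = torch.autograd.functional.jacobian(scalar_outputs, inputs_embeds)
grad_matrix_jac = jac.norm(dim=-1).squeeze(1)

# Route 2: one autograd.grad call per attended-to position k.
out = model(inputs_embeds=inputs_embeds,
            attention_mask=inputs["attention_mask"],
            output_attentions=True)
attn = out.attentions[target_layer].squeeze(0).mean(dim=0)  # [seq, seq]
cols = []
for k in range(attn.shape[0]):
    grad = torch.autograd.grad(attn[:, k].sum(), inputs_embeds, retain_graph=True)[0].squeeze(0)
    cols.append(grad.norm(dim=1).unsqueeze(1))  # [seq, 1]
grad_matrix_loop = torch.cat(cols, dim=1)       # [seq, seq]

print(grad_matrix_jac.shape, grad_matrix_loop.shape)

Both routes produce a [seq, seq] matrix of gradient norms; the jacobian route replaces the Python-level loop over columns with a single call, which is the trade-off this commit experiments with (it keeps the loop for timing but only stores the jacobian result).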