add support for last token pooling
semscore.py  CHANGED  (+20 -3)
@@ -87,6 +87,7 @@ class SemScore(evaluate.Metric):
         # Load model and tokenizer from HuggingFace Hub
         self.model = AutoModel.from_pretrained(checkpoint)
         self.model.eval()
+        padding_side = "left" if self
         self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 
     @staticmethod
@@ -95,6 +96,16 @@ class SemScore(evaluate.Metric):
         token_embeddings = model_output[0]
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+    @staticmethod
+    def _last_token_pooling(last_hidden_states, attention_mask):
+        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+        if left_padding:
+            return last_hidden_states[:, -1]
+        else:
+            sequence_lengths = attention_mask.sum(dim=1) - 1
+            batch_size = last_hidden_states.shape[0]
+            return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
     def _compute(
         self,
@@ -102,10 +113,12 @@ class SemScore(evaluate.Metric):
         references,
         batch_size=32,
         device=None,
+        pooling="mean"
     ):
         """Returns the scores"""
 
         assert len(predictions) == len(references), "predictions and references should have the same length."
+        assert pooling in ["mean", "last"]
         if device is not None:
             if "cuda" in device:
                 assert torch.cuda.is_available()
@@ -123,8 +136,12 @@ class SemScore(evaluate.Metric):
             encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors='pt')
             model_output_refs = self.model(**encoded_refs.to(device))
             model_output_preds = self.model(**encoded_preds.to(device))
-            batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs['attention_mask'])
-            batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds['attention_mask'])
+            if pooling == "mean":
+                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs['attention_mask'])
+                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds['attention_mask'])
+            elif pooling == "last":
+                batch_pooled_refs = self._last_token_pooling(model_output_refs[0], encoded_refs['attention_mask'])
+                batch_pooled_preds = self._last_token_pooling(model_output_preds[0], encoded_preds['attention_mask'])
             pooled_refs.append(batch_pooled_refs)
             pooled_preds.append(batch_pooled_preds)
         pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)
@@ -136,4 +153,4 @@ class SemScore(evaluate.Metric):
         return {
             "semscore": round(semscore.item(), 2),
             "similarities": similarities.tolist()
-            }
+        }
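
For reference, a minimal standalone sketch of the last-token pooling logic added above (the function name, tensor contents, and shapes here are illustrative, not part of the commit). With left padding, every sequence's last real token sits in the final position, so the last column of the hidden states can be taken directly; with right padding, the last real token sits at a different index per row and is gathered via the attention mask:

import torch

def last_token_pooling(last_hidden_states, attention_mask):
    # If every sequence has a real (non-pad) token in the final position,
    # the batch is left-padded and the last column already holds the answer.
    if attention_mask[:, -1].sum() == attention_mask.shape[0]:
        return last_hidden_states[:, -1]
    # Right padding: locate each sequence's last non-pad token instead.
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[batch, sequence_lengths]

# Toy batch: 2 right-padded sequences, max length 4, hidden size 3.
hidden = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)
mask = torch.tensor([[1, 1, 1, 0],   # length 3 -> picks position 2
                     [1, 1, 1, 1]])  # length 4 -> picks position 3
print(last_token_pooling(hidden, mask))  # rows hidden[0, 2] and hidden[1, 3]

The left-padding fast path matters because decoder-style embedding models are typically tokenized with padding_side="left", in which case the final position is the last real token for every row of the batch.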
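
A minimal usage sketch, assuming the metric script is loaded from a local path (the path and example strings are illustrative). Extra keyword arguments to compute() are forwarded to _compute(), so the new pooling option is passed the same way as batch_size:

import evaluate

semscore = evaluate.load("./semscore.py")  # illustrative local path

preds = ["The cat sat on the mat."]
refs = ["A cat was sitting on the mat."]

# Default behaviour is unchanged: mean pooling.
print(semscore.compute(predictions=preds, references=refs))

# New in this commit: pool with the last non-pad token instead.
print(semscore.compute(predictions=preds, references=refs, pooling="last"))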