Update cross_encoder_reranking_train.py
cross_encoder_reranking_train.py
CHANGED
@@ -134,13 +134,9 @@ def extract_text(content_dict, text_type="full"):
         for key, value in content_dict.items():
             if key.startswith('c-'):
                 content.append(value)
-            if key=="features":
-                content+=list(content_dict[key].values())
 
-
-
-        # for key, value in content_dict["features"]:
-        #     content.append(value)
+        for key, value in content_dict["features"]:
+            content.append(value)
         return " ".join(content)
 
     elif text_type == "tac1":
@@ -232,6 +228,13 @@ def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
     batch_size = last_hidden_states.shape[0]
     return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 
+def cls_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
+    """Extract [CLS] token representations, accounting for left padding."""
+    # Get the index of the first non-padding token in each sequence
+    cls_indices = attention_mask.float().argmax(dim=1)
+    batch_size = last_hidden_states.size(0)
+    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), cls_indices]
+
 def get_detailed_instruct(task_description: str, query: str) -> str:
     """Create an instruction-formatted query"""
     return f'Instruct: {task_description}\nQuery: {query}'
@@ -273,7 +276,7 @@ def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=
 
         # Get embeddings
         outputs = model(**batch_dict)
-        embeddings =
+        embeddings = cls_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 
         # Normalize embeddings
         embeddings = F.normalize(embeddings, p=2, dim=1)