Update app.py
app.py CHANGED
@@ -29,6 +29,34 @@ except Exception as e:
     print(f"Error loading SPLADE v3 Lexical model: {e}")
     print(f"Please ensure '{splade_lexical_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
 
+# --- Helper function for lexical mask ---
+def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
+    """
+    Creates a binary bag-of-words mask from input_ids,
+    zeroing out special tokens and padding.
+    """
+    # Initialize a zero vector for the entire vocabulary
+    bow_mask = torch.zeros(vocab_size, device=input_ids.device)
+
+    # Get unique token IDs from the input, excluding special tokens.
+    # input_ids is typically [batch_size, seq_len]; batch_size=1 is assumed here.
+    meaningful_token_ids = []
+    for token_id in input_ids.squeeze().tolist():  # Squeeze to remove batch dim and convert to list
+        if token_id not in [
+            tokenizer.pad_token_id,
+            tokenizer.cls_token_id,
+            tokenizer.sep_token_id,
+            tokenizer.mask_token_id,
+            tokenizer.unk_token_id,  # Also exclude unknown tokens
+        ]:
+            meaningful_token_ids.append(token_id)
+
+    # Set 1 for tokens present in the original input
+    if meaningful_token_ids:
+        bow_mask[list(set(meaningful_token_ids))] = 1  # Use set to handle duplicates
+
+    return bow_mask.unsqueeze(0)  # Keep batch dimension for consistency
+
 
 # --- Core Representation Functions ---
 
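Side note: the loop above is easy to follow, but the same bag-of-words mask can be built without a Python-level loop. A minimal standalone sketch, assuming a Hugging Face tokenizer (bert-base-uncased is a stand-in here, not necessarily the Space's model) and torch.isin from PyTorch >= 1.10:

    import torch
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in tokenizer for illustration

    def bow_mask_vectorized(input_ids: torch.Tensor, vocab_size: int) -> torch.Tensor:
        ids = input_ids.squeeze()  # assumes batch_size=1, as in the helper above
        special = torch.tensor(tokenizer.all_special_ids, device=ids.device)
        keep = ids[~torch.isin(ids, special)]  # drop PAD/CLS/SEP/MASK/UNK in one shot
        mask = torch.zeros(vocab_size, device=ids.device)
        mask[keep.unique()] = 1  # .unique() plays the role of set()
        return mask.unsqueeze(0)  # keep the batch dimension, matching the helper

    inputs = tokenizer("sparse retrieval with SPLADE", return_tensors="pt")
    mask = bow_mask_vectorized(inputs["input_ids"], tokenizer.vocab_size)
    print(int(mask.sum().item()))  # number of distinct non-special tokens in the input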
@@ -79,7 +107,7 @@ def get_splade_representation(text):
     return formatted_output
 
 
-def get_splade_lexical_representation(text):
+def get_splade_lexical_representation(text, apply_lexical_mask: bool):  # Added parameter
     if tokenizer_splade_lexical is None or model_splade_lexical is None:
         return "SPLADE v3 Lexical model is not loaded. Please check the console for loading errors."
 
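The masking logic added in the next hunk references `inputs`, which never appears in the visible parts of this diff. Presumably the elided function body tokenizes the text and runs a forward pass along these lines (an assumption about the hidden lines, not the Space's verbatim code):

    inputs = tokenizer_splade_lexical(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model_splade_lexical(**inputs)  # the 'logits' check in the next hunk inspects this output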
@@ -97,6 +125,20 @@ def get_splade_lexical_representation(text):
     else:
         return "Model output structure not as expected for SPLADE v3 Lexical. 'logits' not found."
 
+    # --- Apply Lexical Mask if requested ---
+    if apply_lexical_mask:
+        # Get the vocabulary size from the tokenizer
+        vocab_size = tokenizer_splade_lexical.vocab_size
+
+        # Create the Bag-of-Words mask
+        bow_mask = create_lexical_bow_mask(
+            inputs['input_ids'], vocab_size, tokenizer_splade_lexical
+        ).squeeze()  # Squeeze to match splade_vector's [vocab_size] shape
+
+        # Multiply the SPLADE vector by the BoW mask to zero out expanded terms
+        splade_vector = splade_vector * bow_mask
+    # --- End Lexical Mask Logic ---
+
     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
         indices = [indices]
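For context on what the mask multiplies: this diff never shows the line that builds splade_vector, but the standard SPLADE formulation pools log(1 + ReLU(logits)) with a max over the sequence dimension. A self-contained sketch under that assumption:

    import torch

    def splade_pool(logits: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        # logits: [1, seq_len, vocab_size]; attention_mask: [1, seq_len]
        weights = torch.log1p(torch.relu(logits))
        weights = weights * attention_mask.unsqueeze(-1)  # zero out padding positions
        return torch.max(weights, dim=1).values.squeeze(0)  # -> [vocab_size]

Because bow_mask is a 0/1 vector of shape [vocab_size], the `splade_vector * bow_mask` product keeps only terms that literally occur in the input and zeroes every neural-expansion term, which is exactly what the lexical-only mode advertises.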
@@ -130,8 +172,12 @@ def get_splade_lexical_representation(text):
 def predict_representation(model_choice, text):
     if model_choice == "SPLADE (cocondenser)":
         return get_splade_representation(text)
-    elif model_choice == "SPLADE-v3-Lexical":
-        return get_splade_lexical_representation(text)
+    elif model_choice == "SPLADE-v3-Lexical (with expansion)":
+        # Call the lexical function without applying the mask
+        return get_splade_lexical_representation(text, apply_lexical_mask=False)
+    elif model_choice == "SPLADE-v3-Lexical (lexical-only)":
+        # Call the lexical function applying the mask
+        return get_splade_lexical_representation(text, apply_lexical_mask=True)
     else:
         return "Please select a model."
 
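A quick sanity check of the new dispatch, assuming the Space's models loaded successfully; the choice strings must match the Radio options exactly, parenthesized suffixes included:

    print(predict_representation("SPLADE-v3-Lexical (with expansion)", "sparse retrieval"))
    print(predict_representation("SPLADE-v3-Lexical (lexical-only)", "sparse retrieval"))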
@@ -140,7 +186,11 @@ demo = gr.Interface(
     fn=predict_representation,
     inputs=[
         gr.Radio(
-            ["SPLADE (cocondenser)", "SPLADE-v3-Lexical"],
+            [
+                "SPLADE (cocondenser)",
+                "SPLADE-v3-Lexical (with expansion)",  # Option to see full neural output
+                "SPLADE-v3-Lexical (lexical-only)"  # Option with lexical mask applied
+            ],
             label="Choose Representation Model",
             value="SPLADE (cocondenser)"  # Default selection
         ),
@@ -151,8 +201,8 @@ demo = gr.Interface(
         )
     ],
     outputs=gr.Markdown(),
-    title="🌌 Sparse
-    description="Enter any text to see its SPLADE sparse vector
+    title="🌌 Sparse Representation Generator",
+    description="Enter any text to see its SPLADE sparse vector. Explore the difference between full neural expansion and lexical-only representations.",
     allow_flagging="never"
 )
 
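Not shown in this diff, but a Gradio Space's app.py conventionally ends by launching the interface; a minimal sketch:

    if __name__ == "__main__":
        demo.launch()  # starts the local server when run outside Spaces

One caution: newer Gradio releases deprecate the allow_flagging argument used above (Gradio 5.x renames it to flagging_mode), so pinning the Gradio version in the Space's requirements.txt avoids surprises.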