Update app.py

app.py CHANGED
@@ -2,8 +2,8 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 import torch
 import numpy as np
-from tqdm.auto import tqdm
-import os
+from tqdm.auto import tqdm
+import os

 # --- Model Loading ---
 tokenizer_splade = None
@@ -34,11 +34,11 @@ except Exception as e:
     print(f"Error loading SPLADE-v3-Lexical model: {e}")
     print(f"Please ensure '{splade_lexical_model_name}' is accessible (check Hugging Face Hub for potential agreements).")

-# Load SPLADE v3 Doc model
+# Load SPLADE v3 Doc model - Model loading is still necessary even if its logits aren't used for BoW
 try:
     splade_doc_model_name = "naver/splade-v3-doc"
     tokenizer_splade_doc = AutoTokenizer.from_pretrained(splade_doc_model_name)
-    model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name)
+    model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name) # Still load the model
     model_splade_doc.eval()
     print(f"SPLADE-v3-Doc model '{splade_doc_model_name}' loaded successfully!")
 except Exception as e:
@@ -183,25 +183,19 @@ def get_splade_lexical_representation(text):


 def get_splade_doc_representation(text):
-    if tokenizer_splade_doc is None
-        return "SPLADE-v3-Doc
+    if tokenizer_splade_doc is None: # No longer need model_splade_doc to be loaded for 'logits'
+        return "SPLADE-v3-Doc tokenizer is not loaded. Please check the console for loading errors."

     inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
-    inputs = {k: v.to(
-
-    with torch.no_grad():
-        output = model_splade_doc(**inputs)
-
-    if not hasattr(output, "logits"):
-        return "Model output structure not as expected. 'logits' not found."
+    inputs = {k: v.to(torch.device("cpu")) for k, v in inputs.items()} # Ensure on CPU for direct mask creation

     vocab_size = tokenizer_splade_doc.vocab_size
-    #
-
+    # Directly create the binary Bag-of-Words vector using the input_ids
+    binary_bow_vector = create_lexical_bow_mask(
         inputs['input_ids'], vocab_size, tokenizer_splade_doc
     ).squeeze() # Squeeze back for single output

-    indices = torch.nonzero(
+    indices = torch.nonzero(binary_bow_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
         indices = [indices] if indices else []

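The rewritten body above calls create_lexical_bow_mask, a helper defined elsewhere in app.py that this diff does not show. A minimal sketch of what such a helper plausibly looks like, given how it is called here (a hypothetical reconstruction, not the file's actual definition): allocate a (1, vocab_size) tensor and set 1.0 at every input token id, skipping special tokens.

    import torch

    def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
        # Hypothetical reconstruction: builds a (1, vocab_size) binary
        # bag-of-words vector with 1.0 at each vocabulary id present in
        # input_ids, ignoring special tokens such as [CLS], [SEP], [PAD].
        bow_mask = torch.zeros(1, vocab_size)
        special_ids = set(tokenizer.all_special_ids)
        for token_id in input_ids.squeeze(0).tolist():
            if token_id not in special_ids:
                bow_mask[0, token_id] = 1.0
        return bow_mask

The caller then squeezes the result to a flat (vocab_size,) vector, which is the shape the torch.nonzero line in the hunk above expects.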
@@ -216,7 +210,7 @@ def get_splade_doc_representation(text):

     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity

-    formatted_output = "
+    formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
@@ -226,7 +220,7 @@ def get_splade_doc_representation(text):
             break
         formatted_output += f"- **{term}**\n"

-    formatted_output += "\n--- Raw Binary
+    formatted_output += "\n--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
     formatted_output += f"Total activated terms: {len(indices)}\n"
     formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"

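The Sparsity line in this hunk is simple arithmetic: one minus the fraction of vocabulary entries that are activated. A quick worked example, assuming the 30,522-entry bert-base-uncased vocabulary that backs the naver/splade-v3-doc tokenizer:

    # Worked example of the sparsity formula above (vocab size is an assumption):
    indices = list(range(15))  # suppose 15 terms were activated
    vocab_size = 30522         # bert-base-uncased vocabulary size
    print(f"Sparsity: {1 - (len(indices) / vocab_size):.2%}")  # -> Sparsity: 99.95%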
@@ -235,11 +229,11 @@ def get_splade_doc_representation(text):

 # --- Unified Prediction Function for the Explorer Tab ---
 def predict_representation_explorer(model_choice, text):
-    if model_choice == "SPLADE-cocondenser-distil
+    if model_choice == "MLM encoder (SPLADE-cocondenser-distil)":
         return get_splade_cocondenser_representation(text)
     elif model_choice == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_representation(text)
-    elif model_choice == "Binary
+    elif model_choice == "Binary Bag-of-Words": # Changed name
         return get_splade_doc_representation(text)
     else:
         return "Please select a model."
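After the rename, this dispatch and the UI choice lists further down must agree on the exact string. A quick sanity check one could run in the Space's environment (assuming app.py's functions are importable):

    # The choice string must match the Radio options verbatim, or the
    # dispatch falls through to "Please select a model."
    print(predict_representation_explorer("Binary Bag-of-Words", "test query"))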
@@ -290,22 +284,18 @@ def get_splade_lexical_vector(text):
         return None

 def get_splade_doc_vector(text):
-    if tokenizer_splade_doc is None
+    if tokenizer_splade_doc is None: # No longer need model_splade_doc to be loaded for 'logits'
         return None

     inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
-    inputs = {k: v.to(
-
-    with torch.no_grad():
-        output = model_splade_doc(**inputs)
+    inputs = {k: v.to(torch.device("cpu")) for k, v in inputs.items()} # Ensure on CPU for direct mask creation

-
-
-
-
-
-
-    return None
+    vocab_size = tokenizer_splade_doc.vocab_size
+    # Directly create the binary Bag-of-Words vector using the input_ids
+    binary_bow_vector = create_lexical_bow_mask(
+        inputs['input_ids'], vocab_size, tokenizer_splade_doc
+    ).squeeze()
+    return binary_bow_vector


 # --- Function to get formatted representation from a raw vector and tokenizer ---
@@ -322,7 +312,7 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
         values = [1.0] * len(indices)
     else:
         values = splade_vector[indices].cpu().tolist()
-
+
     token_weights = dict(zip(indices, values))

     meaningful_tokens = {}
@@ -361,8 +351,8 @@ def get_model_assets(model_choice_str):
         return get_splade_cocondenser_vector, tokenizer_splade, False, "MLM encoder (SPLADE-cocondenser-distil)"
     elif model_choice_str == "MLP encoder (SPLADE-v3-lexical)":
         return get_splade_lexical_vector, tokenizer_splade_lexical, False, "MLP encoder (SPLADE-v3-lexical)"
-    elif model_choice_str == "Binary
-        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary
+    elif model_choice_str == "Binary Bag-of-Words": # Changed name
+        return get_splade_doc_vector, tokenizer_splade_doc, True, "Binary Bag-of-Words" # Changed name
     else:
         return None, None, False, "Unknown Model"

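For context on how the renamed choice flows through the app: get_model_assets hands back the vector function, the tokenizer, and the is_binary flag together, so the binary path formats every activated term with weight 1.0. A small driver sketch, assuming format_sparse_vector_output (partially shown in the previous hunk) returns the formatted string:

    # Hypothetical driver wiring together the pieces shown in this diff.
    vector_fn, tok, is_binary, label = get_model_assets("Binary Bag-of-Words")
    vec = vector_fn("sparse retrieval with splade")  # flat binary vector, or None on load failure
    if vec is not None:
        print(label)
        print(format_sparse_vector_output(vec, tok, is_binary=is_binary))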
@@ -415,7 +405,7 @@ with gr.Blocks(title="SPLADE Demos") as demo:
                 [
                     "MLM encoder (SPLADE-cocondenser-distil)",
                     "MLP encoder (SPLADE-v3-lexical)",
-                    "Binary
+                    "Binary Bag-of-Words" # Changed name here
                 ],
                 label="Choose Sparse Encoder",
                 value="MLM encoder (SPLADE-cocondenser-distil)"
@@ -439,7 +429,7 @@ with gr.Blocks(title="SPLADE Demos") as demo:
     model_choices = [
         "MLM encoder (SPLADE-cocondenser-distil)",
         "MLP encoder (SPLADE-v3-lexical)",
-        "Binary
+        "Binary Bag-of-Words" # Changed name here
     ]

     gr.Interface(