SiddharthAK committed
Commit da3acda · verified · 1 Parent(s): 1728ea0

added unicoil

Files changed (1)
  1. app.py +197 -72
app.py CHANGED
@@ -1,119 +1,244 @@
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForMaskedLM
  import torch

  # --- Model Loading ---
- # Load SPLADE v3 model and tokenizer
- # "naver/splade-v3" is a common and robust SPLADE v3 model.
- # Make sure you've accepted any user access agreements on its Hugging Face Hub page.
  try:
-     tokenizer = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
-     model = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil")
-     model.eval()  # Set the model to evaluation mode for inference
-     print("SPLADE v3 model and tokenizer loaded successfully!")
  except Exception as e:
-     print(f"Error loading SPLADE model or tokenizer: {e}")
-     print("Please ensure you have accepted any user access agreements on the Hugging Face Hub page for 'naver/splade-v3' (https://huggingface.co/naver/splade-v3).")
-     print("If the problem persists, check your internet connection or try a different SPLADE model if available.")
-     tokenizer = None
-     model = None

- # --- Core SPLADE Representation Function ---
  def get_splade_representation(text):
-     if tokenizer is None or model is None:
          return "SPLADE model is not loaded. Please check the console for loading errors."

-     # Tokenize the input text
-     # return_tensors="pt" ensures PyTorch tensors are returned.
-     # padding=True pads to the longest sequence in the batch (though for single input, it's just the input length).
-     # truncation=True truncates if the text is too long for the model's max input size.
-     inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

-     # Move inputs to the same device as the model (e.g., CPU or GPU)
-     # This is important if you were running on a GPU in a production environment.
-     # For Hugging Face Spaces free tier, it's usually CPU.
-     inputs = {k: v.to(model.device) for k, v in inputs.items()}
-
-     # Get the model's output without calculating gradients (inference mode)
      with torch.no_grad():
-         output = model(**inputs)
-
-     # Extract the logits from the model's output.
-     # SPLADE uses the masked language modeling head's logits to derive term importance.
-     # Apply the SPLADE aggregation function: log(1 + ReLU(logits))
-     # This transforms the raw logits into a sparse vector where higher values indicate more importance.
-     # The attention_mask ensures we only consider actual tokens, not padding.
-
-     # Check if 'logits' is in the output (standard for AutoModelForMaskedLM)
      if hasattr(output, 'logits'):
-         # Apply the SPLADE transformation
-         # output.logits is typically [batch_size, sequence_length, vocab_size]
-         # We need to take the max over the sequence_length dimension to get a [batch_size, vocab_size] vector.
-         # inputs.attention_mask.unsqueeze(-1) expands the mask to match vocab_size for element-wise multiplication.
          splade_vector = torch.max(torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0].squeeze()
      else:
-         # Fallback/error message if the output structure is unexpected
          return "Model output structure not as expected for SPLADE. 'logits' not found."

-     # Convert the sparse vector to a human-readable format.
-     # We only care about the non-zero (or very small) entries, as they represent activated terms.
-
-     # Get the indices (token IDs) of the non-zero elements in the SPLADE vector
-     # torch.nonzero returns coordinates of non-zero elements. squeeze() removes dimensions of size 1.
-     # .cpu().tolist() moves the tensor to CPU and converts to a Python list.
      indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
-
-     # If it's a single index (e.g., a very short text), make it a list for consistent processing
      if not isinstance(indices, list):
          indices = [indices]
-
-     # Get the corresponding values (weights) for these non-zero indices
      values = splade_vector[indices].cpu().tolist()
-
-     # Create a dictionary mapping token ID to its weight
      token_weights = dict(zip(indices, values))

-     # Decode token IDs back to actual words/subwords
-     # Filter out common special tokens that are not meaningful for retrieval (e.g., [CLS], [SEP], [PAD])
-     # You can add more tokens to this list if they appear frequently and are not helpful.
      meaningful_tokens = {}
      for token_id, weight in token_weights.items():
-         decoded_token = tokenizer.decode([token_id])
-         # Filter out special tokens or very short/noisy tokens
          if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
              meaningful_tokens[decoded_token] = weight

-     # Sort the meaningful tokens by their weight in descending order
      sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)

-     # Format the output for display
      formatted_output = "SPLADE Representation (Top 20 Terms):\n"
      if not sorted_representation:
          formatted_output += "No significant terms found for this input.\n"
      else:
          for i, (term, weight) in enumerate(sorted_representation):
-             if i >= 20:  # Limit to top 20 terms for readability
                  break
              formatted_output += f"- **{term}**: {weight:.4f}\n"
-
      formatted_output += "\n--- Raw SPLADE Vector Info ---\n"
      formatted_output += f"Total non-zero terms in vector: {len(indices)}\n"
-     formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"  # Calculate sparsity

      return formatted_output

  # --- Gradio Interface Setup ---
  demo = gr.Interface(
-     fn=get_splade_representation,
-     inputs=gr.Textbox(
-         lines=5,
-         label="Enter your query or document text here:",
-         placeholder="Why is Padua the nicest city in Italy?"
-     ),
-     outputs=gr.Markdown(),  # Use Markdown for richer text formatting (bolding terms)
-     title="🌌 SPLADE v3 Sparse Representation Generator",
-     description="Enter any text (query or document) to see its SPLADE v3 sparse vector representation. The output highlights the most important terms with their learned weights.",
-     allow_flagging="never"  # Disable flagging for this demo
  )

  # Launch the Gradio app
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
  import torch

  # --- Model Loading ---
+ tokenizer_splade = None
+ model_splade = None
+ tokenizer_unicoil = None
+ model_unicoil = None
+
+ # Load SPLADE v3 model
+ try:
+     tokenizer_splade = AutoTokenizer.from_pretrained("naver/splade-cocondenser-selfdistil")
+     model_splade = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-selfdistil")
+     model_splade.eval()  # Set to evaluation mode for inference
+     print("SPLADE v3 model loaded successfully!")
+ except Exception as e:
+     print(f"Error loading SPLADE model: {e}")
+     print("Please ensure you have accepted any user access agreements on the Hugging Face Hub page for 'naver/splade-cocondenser-selfdistil'.")
+
+ # Load UNICOIL model for binary sparse encoding
  try:
+     # UNICOIL checkpoints are typically a BERT-like encoder with a small linear head
+     # that predicts a weight for each input token. 'castorini/unicoil-msmarco-passage'
+     # is a common checkpoint; it is loaded here with AutoModel (base encoder only).
+     unicoil_model_name = "castorini/unicoil-msmarco-passage"
+     tokenizer_unicoil = AutoTokenizer.from_pretrained(unicoil_model_name)
+     model_unicoil = AutoModel.from_pretrained(unicoil_model_name)
+     model_unicoil.eval()  # Set to evaluation mode for inference
+     print(f"UNICOIL model '{unicoil_model_name}' loaded successfully!")
  except Exception as e:
+     print(f"Error loading UNICOIL model: {e}")
+     print(f"Please ensure '{unicoil_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
+
+
+ # --- Core Representation Functions ---

  def get_splade_representation(text):
+     if tokenizer_splade is None or model_splade is None:
          return "SPLADE model is not loaded. Please check the console for loading errors."

+     inputs = tokenizer_splade(text, return_tensors="pt", padding=True, truncation=True)
+     inputs = {k: v.to(model_splade.device) for k, v in inputs.items()}

      with torch.no_grad():
+         output = model_splade(**inputs)
+
      if hasattr(output, 'logits'):
          splade_vector = torch.max(torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0].squeeze()
      else:
          return "Model output structure not as expected for SPLADE. 'logits' not found."

      indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
      if not isinstance(indices, list):
          indices = [indices]
+
      values = splade_vector[indices].cpu().tolist()
      token_weights = dict(zip(indices, values))

      meaningful_tokens = {}
      for token_id, weight in token_weights.items():
+         decoded_token = tokenizer_splade.decode([token_id])
          if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
              meaningful_tokens[decoded_token] = weight

      sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)

      formatted_output = "SPLADE Representation (Top 20 Terms):\n"
      if not sorted_representation:
          formatted_output += "No significant terms found for this input.\n"
      else:
          for i, (term, weight) in enumerate(sorted_representation):
+             if i >= 20:
                  break
              formatted_output += f"- **{term}**: {weight:.4f}\n"
+
      formatted_output += "\n--- Raw SPLADE Vector Info ---\n"
      formatted_output += f"Total non-zero terms in vector: {len(indices)}\n"
+     formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade.vocab_size):.2%}\n"
+
+     return formatted_output
+
+
+ def get_unicoil_binary_representation(text):
+     if tokenizer_unicoil is None or model_unicoil is None:
+         return "UNICOIL model is not loaded. Please check the console for loading errors."
+
+     inputs = tokenizer_unicoil(text, return_tensors="pt", padding=True, truncation=True)
+     inputs = {k: v.to(model_unicoil.device) for k, v in inputs.items()}
+
+     with torch.no_grad():
+         # UNICOIL learns an explicit weight for each token. How those weights are exposed
+         # depends on the checkpoint: an MLM-style head returns per-token scores over the
+         # vocabulary in `output.logits`, while a plain encoder (as loaded by AutoModel)
+         # returns only hidden states. The guard below assumes the former and reports an
+         # error otherwise.
+         output = model_unicoil(**inputs)
+
+     if not hasattr(output, 'logits'):
+         return "UNICOIL model output structure not as expected. 'logits' not found."
+
+     # Turn the logits into non-negative per-term scores with softplus, log(1 + exp(x)),
+     # then max-pool over the sequence dimension (padding is masked out). Softplus keeps
+     # every value positive, so the sparsity of the final vector comes from the
+     # binarization threshold applied below, not from the transformation itself.
+     sparse_weights = torch.max(torch.log(1 + torch.exp(output.logits)) * inputs['attention_mask'].unsqueeze(-1), dim=1)[0].squeeze()
+
+     # --- Binarization Step for UNICOIL ---
+     # Treat any score above a small epsilon as an activated term (1), everything else as 0.
+     threshold = 1e-6  # Small threshold for binarization
+     binary_sparse_vector = (sparse_weights > threshold).int()
+
+     # Get indices of the '1's in the binary vector
+     binary_indices = torch.nonzero(binary_sparse_vector).squeeze().cpu().tolist()
+
+     # A single activation comes back as a plain int; wrap it for consistent processing
+     if not isinstance(binary_indices, list):
+         binary_indices = [binary_indices]
+
+     # Map token IDs back to terms for the binary representation
+     binary_terms = {}
+     for token_id in binary_indices:
+         decoded_token = tokenizer_unicoil.decode([token_id])
+         if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
+             binary_terms[decoded_token] = 1  # Value is always 1 for binary
+
+     sorted_binary_terms = sorted(binary_terms.items(), key=lambda item: item[0])  # Sort by term for consistent display
+
+     formatted_output = "UNICOIL Binary Sparse Representation (Activated Terms):\n"
+     if not sorted_binary_terms:
+         formatted_output += "No significant terms activated for this input.\n"
+     else:
+         # Display up to 50 activated terms for readability
+         for i, (term, _) in enumerate(sorted_binary_terms):
+             if i >= 50:
+                 break
+             formatted_output += f"- **{term}**\n"  # Only show the term, as the weight is always 1
+         if len(sorted_binary_terms) > 50:
+             formatted_output += f"...and {len(sorted_binary_terms) - 50} more terms.\n"
+
+     formatted_output += "\n--- Raw Binary Sparse Vector Info ---\n"
+     formatted_output += f"Total activated terms: {len(binary_indices)}\n"
+     # Calculate sparsity based on the number of activated terms vs. total vocabulary size
+     formatted_output += f"Sparsity: {1 - (len(binary_indices) / tokenizer_unicoil.vocab_size):.2%}\n"

      return formatted_output

+
+ # --- Unified Prediction Function for Gradio ---
+ def predict_representation(model_choice, text):
+     if model_choice == "SPLADE":
+         return get_splade_representation(text)
+     elif model_choice == "UNICOIL (Binary Sparse)":
+         return get_unicoil_binary_representation(text)
+     else:
+         return "Please select a model."
+
  # --- Gradio Interface Setup ---
  demo = gr.Interface(
+     fn=predict_representation,
+     inputs=[
+         gr.Radio(
+             ["SPLADE", "UNICOIL (Binary Sparse)"],  # Added UNICOIL option
+             label="Choose Representation Model",
+             value="SPLADE"  # Default selection
+         ),
+         gr.Textbox(
+             lines=5,
+             label="Enter your query or document text here:",
+             placeholder="e.g., Why is Padua the nicest city in Italy?"
+         )
+     ],
+     outputs=gr.Markdown(),
+     title="🌌 Sparse and Binary Sparse Representation Generator",
+     description="Enter any text to see its SPLADE sparse vector or UNICOIL binary sparse representation.",
+     allow_flagging="never"
  )

  # Launch the Gradio app
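
Note on the UNICOIL branch: get_unicoil_binary_representation only returns real weights if the checkpoint's forward pass exposes an MLM-style 'logits' output; a model loaded with plain AutoModel returns only hidden states, in which case the function falls back to its error message. A minimal probe for checking what 'castorini/unicoil-msmarco-passage' actually provides, as a standalone sketch rather than part of app.py:

    from transformers import AutoConfig, AutoModel, AutoTokenizer
    import torch

    name = "castorini/unicoil-msmarco-passage"

    # The saved config records which architecture(s) the checkpoint was exported with.
    config = AutoConfig.from_pretrained(name)
    print(config.architectures)

    # Probe the forward pass: does the output carry 'logits', or only hidden states?
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModel.from_pretrained(name)
    model.eval()
    with torch.no_grad():
        output = model(**tokenizer("sparse retrieval test", return_tensors="pt"))
    print(hasattr(output, "logits"))

If the probe shows no 'logits', the loading block above would need a model class that includes the token-scoring head before the UNICOIL option can produce meaningful output.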