SiddharthAK committed on
Commit 4a365e4 · verified · 1 Parent(s): 63d582f

Update app.py

Files changed (1):
  1. app.py +84 -6
app.py CHANGED
@@ -7,6 +7,8 @@ tokenizer_splade = None
 model_splade = None
 tokenizer_splade_lexical = None
 model_splade_lexical = None
+tokenizer_splade_doc = None # New tokenizer for SPLADE-v3-Doc
+model_splade_doc = None # New model for SPLADE-v3-Doc
 
 # Load SPLADE v3 model (original)
 try:
@@ -29,6 +31,18 @@ except Exception as e:
     print(f"Error loading SPLADE v3 Lexical model: {e}")
     print(f"Please ensure '{splade_lexical_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
 
+# Load SPLADE v3 Doc model (NEW)
+try:
+    splade_doc_model_name = "naver/splade-v3-doc"
+    tokenizer_splade_doc = AutoTokenizer.from_pretrained(splade_doc_model_name)
+    model_splade_doc = AutoModelForMaskedLM.from_pretrained(splade_doc_model_name)
+    model_splade_doc.eval() # Set to evaluation mode for inference
+    print(f"SPLADE v3 Doc model '{splade_doc_model_name}' loaded successfully!")
+except Exception as e:
+    print(f"Error loading SPLADE v3 Doc model: {e}")
+    print(f"Please ensure '{splade_doc_model_name}' is accessible (check Hugging Face Hub for potential agreements).")
+
+
 # --- Helper function for lexical mask ---
 def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
     """
@@ -107,7 +121,7 @@ def get_splade_representation(text):
     return formatted_output
 
 
-def get_splade_lexical_representation(text): # Removed apply_lexical_mask parameter
+def get_splade_lexical_representation(text):
     if tokenizer_splade_lexical is None or model_splade_lexical is None:
         return "SPLADE v3 Lexical model is not loaded. Please check the console for loading errors."
 
@@ -167,12 +181,74 @@ def get_splade_lexical_representation(text): # Removed apply_lexical_mask parame
     return formatted_output
 
 
+# NEW: Function for SPLADE-v3-Doc representation
+def get_splade_doc_representation(text, apply_lexical_mask: bool):
+    if tokenizer_splade_doc is None or model_splade_doc is None:
+        return "SPLADE v3 Doc model is not loaded. Please check the console for loading errors."
+
+    inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model_splade_doc.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output = model_splade_doc(**inputs)
+
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(
+            torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
+            dim=1
+        )[0].squeeze()
+    else:
+        return "Model output structure not as expected for SPLADE v3 Doc. 'logits' not found."
+
+    # --- Apply Lexical Mask if requested ---
+    if apply_lexical_mask:
+        vocab_size = tokenizer_splade_doc.vocab_size
+        bow_mask = create_lexical_bow_mask(
+            inputs['input_ids'], vocab_size, tokenizer_splade_doc
+        ).squeeze()
+        splade_vector = splade_vector * bow_mask
+    # --- End Lexical Mask Logic ---
+
+    indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
+    if not isinstance(indices, list):
+        indices = [indices]
+
+    values = splade_vector[indices].cpu().tolist()
+    token_weights = dict(zip(indices, values))
+
+    meaningful_tokens = {}
+    for token_id, weight in token_weights.items():
+        decoded_token = tokenizer_splade_doc.decode([token_id])
+        if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
+            meaningful_tokens[decoded_token] = weight
+
+    sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+
+    formatted_output = "SPLADE v3 Doc Representation (All Non-Zero Terms):\n"
+    if not sorted_representation:
+        formatted_output += "No significant terms found for this input.\n"
+    else:
+        for term, weight in sorted_representation:
+            formatted_output += f"- **{term}**: {weight:.4f}\n"
+
+    formatted_output += "\n--- Raw SPLADE Vector Info ---\n"
+    formatted_output += f"Total non-zero terms in vector: {len(indices)}\n"
+    formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer_splade_doc.vocab_size):.2%}\n"
+
+    return formatted_output
+
+
 # --- Unified Prediction Function for Gradio ---
 def predict_representation(model_choice, text):
     if model_choice == "SPLADE (cocondenser)":
         return get_splade_representation(text)
-    elif model_choice == "SPLADE-v3-Lexical": # Simplified choice
-        return get_splade_lexical_representation(text) # Always applies lexical mask
+    elif model_choice == "SPLADE-v3-Lexical":
+        # Always applies lexical mask for this option as per last request
+        return get_splade_lexical_representation(text)
+    elif model_choice == "SPLADE-v3-Doc (with expansion)": # New option
+        return get_splade_doc_representation(text, apply_lexical_mask=False)
+    elif model_choice == "SPLADE-v3-Doc (lexical-only)": # New option
+        return get_splade_doc_representation(text, apply_lexical_mask=True)
     else:
         return "Please select a model."
 
 
@@ -182,8 +258,10 @@ demo = gr.Interface(
182
  inputs=[
183
  gr.Radio(
184
  [
185
- "SPLADE-cocondenser-distil (expansion and weighting)",
186
- "SPLADE-v3-Lexical (weighting only)" # Simplified option
 
 
187
  ],
188
  label="Choose Representation Model",
189
  value="SPLADE (cocondenser)" # Default selection
@@ -196,7 +274,7 @@ demo = gr.Interface(
     ],
     outputs=gr.Markdown(),
     title="🌌 Sparse Representation Generator",
-    description="Enter any text to see its SPLADE sparse vector.", # Simplified description
+    description="Enter any text to see its SPLADE sparse vector. Explore different SPLADE models and their expansion behaviors.",
     allow_flagging="never"
 )
 
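Note: after this change, each radio label maps one-to-one onto a representation function in predict_representation. A hypothetical smoke test (it assumes app.py is importable as a module and that the models loaded successfully):

# Hypothetical smoke test; exercises the dispatch outside the Gradio UI.
from app import predict_representation

for choice in [
    "SPLADE (cocondenser)",
    "SPLADE-v3-Lexical",
    "SPLADE-v3-Doc (with expansion)",
    "SPLADE-v3-Doc (lexical-only)",
]:
    print(choice)
    print(predict_representation(choice, "sparse retrieval with splade"))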