SiddharthAK committed
Commit 372cab2 · verified · 1 Parent(s): d302c88

Update app.py

Files changed (1)
  1. app.py +192 -3
app.py CHANGED
@@ -244,11 +244,170 @@ def predict_representation_explorer(model_choice, text):
     else:
         return "Please select a model."
 
+# --- NEW: Core Representation Functions (Return RAW TENSORS - for Dot Product Tab) ---
+def get_splade_cocondenser_vector(text):
+    if tokenizer_splade is None or model_splade is None:
+        return None
+
+    inputs = tokenizer_splade(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model_splade.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output = model_splade(**inputs)
+
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(
+            torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
+            dim=1
+        )[0].squeeze()
+        return splade_vector
+    return None
+
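Note: the pooling above is the standard SPLADE aggregation. For vocabulary term $j$ with MLM logit $l_{ij}$ at sequence position $i$ and attention mask $m_i$,

$$w_j = \max_i \; m_i \cdot \log\bigl(1 + \mathrm{ReLU}(l_{ij})\bigr),$$

so each term's weight is a log-saturated activation, max-pooled over the unpadded token positions.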
+def get_splade_lexical_vector(text):
+    if tokenizer_splade_lexical is None or model_splade_lexical is None:
+        return None
+
+    inputs = tokenizer_splade_lexical(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model_splade_lexical.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output = model_splade_lexical(**inputs)
+
+    if hasattr(output, 'logits'):
+        splade_vector = torch.max(
+            torch.log(1 + torch.relu(output.logits)) * inputs['attention_mask'].unsqueeze(-1),
+            dim=1
+        )[0].squeeze()
+
+        vocab_size = tokenizer_splade_lexical.vocab_size
+        bow_mask = create_lexical_bow_mask(
+            inputs['input_ids'], vocab_size, tokenizer_splade_lexical
+        ).squeeze()
+
+        splade_vector = splade_vector * bow_mask
+        return splade_vector
+    return None
+
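`create_lexical_bow_mask` is called here but defined earlier in app.py, outside this diff. A minimal sketch of what such a helper could look like, given only how it is called (the body below is an assumption, not the repository's implementation):

```python
import torch

# Assumed sketch -- the real helper lives elsewhere in app.py and may differ.
# It builds a {0,1} vector over the vocabulary marking which token ids
# actually occur in the input, skipping special tokens.
def create_lexical_bow_mask(input_ids, vocab_size, tokenizer):
    bow_mask = torch.zeros(1, vocab_size, device=input_ids.device)
    special_ids = set(tokenizer.all_special_ids)  # [CLS], [SEP], [PAD], ...
    for token_id in input_ids.squeeze().tolist():
        if token_id not in special_ids:
            bow_mask[0, token_id] = 1.0
    return bow_mask
```

Multiplying the SPLADE vector by this mask zeroes every expansion term, which is what makes the Lexical variant weight-only.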
+def get_splade_doc_vector(text):
+    if tokenizer_splade_doc is None or model_splade_doc is None:
+        return None
+
+    inputs = tokenizer_splade_doc(text, return_tensors="pt", padding=True, truncation=True)
+    inputs = {k: v.to(model_splade_doc.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output = model_splade_doc(**inputs)
+
+    if hasattr(output, "logits"):
+        vocab_size = tokenizer_splade_doc.vocab_size
+        binary_splade_vector = create_lexical_bow_mask(
+            inputs['input_ids'], vocab_size, tokenizer_splade_doc
+        ).squeeze()
+        return binary_splade_vector
+    return None
+
+
+# --- NEW: Function to get formatted representation from a raw vector and tokenizer ---
+def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
+    if splade_vector is None:
+        return "Failed to generate vector."
+
+    indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
+    if not isinstance(indices, list):
+        indices = [indices] if indices else []
+
+    if is_binary:
+        values = [1.0] * len(indices)
+    else:
+        values = splade_vector[indices].cpu().tolist()
+
+    token_weights = dict(zip(indices, values))
+
+    meaningful_tokens = {}
+    for token_id, weight in token_weights.items():
+        decoded_token = tokenizer.decode([token_id])
+        if decoded_token not in ["[CLS]", "[SEP]", "[PAD]", "[UNK]"] and len(decoded_token.strip()) > 0:
+            meaningful_tokens[decoded_token] = weight
+
+    if is_binary:
+        sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for binary
+    else:
+        sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+
+    formatted_output = ""
+    if not sorted_representation:
+        formatted_output += "No significant terms found.\n"
+    else:
+        for i, (term, weight) in enumerate(sorted_representation):
+            if i >= 50 and is_binary: # Limit display for very long binary lists
+                formatted_output += f"...and {len(sorted_representation) - 50} more terms.\n"
+                break
+            if is_binary:
+                formatted_output += f"- **{term}**\n"
+            else:
+                formatted_output += f"- **{term}**: {weight:.4f}\n"
+
+    formatted_output += f"\nTotal non-zero terms: {len(indices)}\n"
+    formatted_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
+
+    return formatted_output
+
+
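For scale: assuming the BERT-style vocabulary of roughly 30,522 tokens these checkpoints use, a vector with 120 non-zero entries would report a sparsity of 1 − 120/30522 ≈ 99.61%.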
+# --- NEW: Dot Product Calculation Function for the new tab ---
+def calculate_dot_product_and_representations(model_choice, query_text, doc_text):
+    query_vector = None
+    doc_vector = None
+    query_rep_str = ""
+    doc_rep_str = ""
+
+    selected_tokenizer = None
+
+    if model_choice == "SPLADE-cocondenser-distil (weighting and expansion)":
+        query_vector = get_splade_cocondenser_vector(query_text)
+        doc_vector = get_splade_cocondenser_vector(doc_text)
+        selected_tokenizer = tokenizer_splade
+        query_rep_str = "Query SPLADE-cocondenser-distil Representation (Weighting and Expansion):\n"
+        doc_rep_str = "Document SPLADE-cocondenser-distil Representation (Weighting and Expansion):\n"
+        is_binary = False
+    elif model_choice == "SPLADE-v3-Lexical (weighting)":
+        query_vector = get_splade_lexical_vector(query_text)
+        doc_vector = get_splade_lexical_vector(doc_text)
+        selected_tokenizer = tokenizer_splade_lexical
+        query_rep_str = "Query SPLADE-v3-Lexical Representation (Weighting):\n"
+        doc_rep_str = "Document SPLADE-v3-Lexical Representation (Weighting):\n"
+        is_binary = False
+    elif model_choice == "SPLADE-v3-Doc (binary)":
+        query_vector = get_splade_doc_vector(query_text)
+        doc_vector = get_splade_doc_vector(doc_text)
+        selected_tokenizer = tokenizer_splade_doc
+        query_rep_str = "Query SPLADE-v3-Doc Representation (Binary):\n"
+        doc_rep_str = "Document SPLADE-v3-Doc Representation (Binary):\n"
+        is_binary = True
+    else:
+        return "Please select a model."
+
+    if query_vector is None or doc_vector is None:
+        return "Failed to generate one or both vectors. Please check model loading."
+
+    # Calculate dot product
+    dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
+
+    # Format representations
+    query_rep_str += format_sparse_vector_output(query_vector, selected_tokenizer, is_binary)
+    doc_rep_str += format_sparse_vector_output(doc_vector, selected_tokenizer, is_binary)
+
+    # Combine output
+    full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
+    full_output += "---\n\n"
+    full_output += f"{query_rep_str}\n\n---\n\n{doc_rep_str}"
+
+    return full_output
+
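Because both vectors live in the same vocabulary space, this dot product reduces to a sum of weight products over the terms the two representations share. A toy illustration with made-up weights (not real model output):

```python
# Hypothetical query/document term weights, keyed by vocabulary term.
query = {"pizza": 2.1, "naples": 1.8, "best": 0.9}
doc   = {"pizza": 1.5, "naples": 1.2, "crust": 0.7}

# Only terms present in both vectors contribute: 2.1*1.5 + 1.8*1.2
score = sum(w * doc[t] for t, w in query.items() if t in doc)
print(score)  # ≈ 5.31
```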
 
 # --- Gradio Interface Setup with Tabs ---
 with gr.Blocks(title="SPLADE Demos") as demo:
-    gr.Markdown("# 🌌 SPLADE Demos: Sparse Representation Explorer") # Updated title
-    gr.Markdown("Explore different SPLADE models and their sparse representation types.") # Updated description
+    gr.Markdown("# 🌌 SPLADE Demos: Sparse Representation Explorer and Retriever") # Updated title
+    gr.Markdown("Explore different SPLADE models and their sparse representation types, and calculate similarity between query and document representations.") # Updated description
 
     with gr.Tabs():
         with gr.TabItem("Sparse Representation Explorer"):
@@ -275,5 +434,35 @@ with gr.Blocks(title="SPLADE Demos") as demo:
                 allow_flagging="never",
                 # live=True # Setting live=True might be slow for complex models on every keystroke
             )
+
+        with gr.TabItem("Query-Document Dot Product Calculator"): # NEW TAB
+            gr.Markdown("### Calculate Dot Product Similarity between Query and Document")
+            gr.Markdown("Select a SPLADE model to encode both your query and document, then see their sparse representations and their similarity score.")
+            gr.Interface(
+                fn=calculate_dot_product_and_representations,
+                inputs=[
+                    gr.Radio(
+                        [
+                            "SPLADE-cocondenser-distil (weighting and expansion)",
+                            "SPLADE-v3-Lexical (weighting)",
+                            "SPLADE-v3-Doc (binary)"
+                        ],
+                        label="Choose Encoding Model",
+                        value="SPLADE-cocondenser-distil (weighting and expansion)"
+                    ),
+                    gr.Textbox(
+                        lines=3,
+                        label="Enter Query Text:",
+                        placeholder="e.g., best pizza in Naples"
+                    ),
+                    gr.Textbox(
+                        lines=5,
+                        label="Enter Document Text:",
+                        placeholder="e.g., Naples is famous for its delicious pizza, known for its soft, chewy crust and fresh ingredients."
+                    )
+                ],
+                outputs=gr.Markdown(),
+                allow_flagging="never"
+            )
 
-demo.launch()
+demo.launch()
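For a quick headless check of the new function (assuming app.py's model and tokenizer globals loaded), something like the following should print the markdown the new tab renders; the texts are the tab's own placeholders:

```python
# Hypothetical smoke test; assumes app.py's models and tokenizers loaded.
result = calculate_dot_product_and_representations(
    "SPLADE-cocondenser-distil (weighting and expansion)",
    "best pizza in Naples",
    "Naples is famous for its delicious pizza, known for its soft, chewy "
    "crust and fresh ingredients.",
)
print(result)  # markdown: dot-product score, then both sparse representations
```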