Spaces:

SiddharthAK
/

TextLSRDemo

Running

App Files Files Community

SiddharthAK committed 19 days ago

Commit

da0c779

verified ·

1 Parent(s): 5bf8193

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -27

app.py CHANGED Viewed

@@ -113,12 +113,15 @@ def get_splade_cocondenser_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
         for term, weight in sorted_representation:
-            formatted_output += f"- **{term}**: {weight:.4f}\n"
     info_output = f"--- Sparse Vector Info ---\n"
     info_output += f"Total non-zero terms in vector: {len(indices)}\n"
@@ -168,12 +171,15 @@ def get_splade_lexical_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n"
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
         for term, weight in sorted_representation:
-            formatted_output += f"- **{term}**: {weight:.4f}\n"
     info_output = f"--- Raw Sparse Vector Info ---\n"
     info_output += f"Total non-zero terms in vector: {len(indices)}\n"
@@ -210,15 +216,15 @@ def get_splade_doc_representation(text):
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
-    formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
-        for i, (term, _) in enumerate(sorted_representation):
-            if i >= 50: # Limit display for very long lists
-                formatted_output += f"...and {len(sorted_representation) - 50} more terms.\n"
-                break
-            formatted_output += f"- **{term}**\n"
     info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
     info_output += f"Total activated terms: {len(indices)}\n"
@@ -302,7 +308,7 @@ def get_splade_doc_vector(text):
 # This function remains unchanged as it's a generic formatter for any sparse vector.
 def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
     if splade_vector is None:
-        return "Failed to generate vector."
     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
@@ -326,20 +332,23 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
     else:
         sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
-    formatted_output = ""
     if not sorted_representation:
         formatted_output += "No significant terms found.\n"
     else:
         for i, (term, weight) in enumerate(sorted_representation):
-            if i >= 50 and is_binary: # Limit display for very long binary lists
-                formatted_output += f"...and {len(sorted_representation) - 50} more terms.\n"
                 break
             if is_binary:
-                formatted_output += f"- **{term}**\n"
             else:
-                formatted_output += f"- **{term}**: {weight:.4f}\n"
-    info_output = f"\nTotal non-zero terms: {len(indices)}\n"
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
     return formatted_output, info_output # Now returns two strings
@@ -375,22 +384,23 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
     # and to ensure .item() works reliably for conversion to float.
     dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
-    # Format representations
-    # These functions now return two strings (main_output, info_output)
     query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
     doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
-    query_rep_str = f"Query Representation ({query_model_name_display}):\n"
-    query_rep_str += query_main_rep_str + "\n" + query_info_str
-    doc_rep_str = f"Document Representation ({doc_model_name_display}):\n"
-    doc_rep_str += doc_main_rep_str + "\n" + doc_info_str
-    # Combine output
     full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
     full_output += "---\n\n"
-    full_output += f"{query_rep_str}\n\n---\n\n{doc_rep_str}"
     return full_output

     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
+        # Changed to paragraph style
+        terms_list = []
         for term, weight in sorted_representation:
+            terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
     info_output = f"--- Sparse Vector Info ---\n"
     info_output += f"Total non-zero terms in vector: {len(indices)}\n"
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
+        # Changed to paragraph style
+        terms_list = []
         for term, weight in sorted_representation:
+            terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
     info_output = f"--- Raw Sparse Vector Info ---\n"
     info_output += f"Total non-zero terms in vector: {len(indices)}\n"
     sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
+    formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
     if not sorted_representation:
         formatted_output += "No significant terms found for this input.\n"
     else:
+        # Changed to paragraph style
+        terms_list = []
+        for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
+            terms_list.append(f"**{term}**")
+        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
     info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
     info_output += f"Total activated terms: {len(indices)}\n"
 # This function remains unchanged as it's a generic formatter for any sparse vector.
 def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
     if splade_vector is None:
+        return "Failed to generate vector.", ""
     indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
     if not isinstance(indices, list):
     else:
         sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
+    formatted_output = "" # Removed initial newline to allow control outside
     if not sorted_representation:
         formatted_output += "No significant terms found.\n"
     else:
+        terms_list = []
         for i, (term, weight) in enumerate(sorted_representation):
+            # Limit display for very long lists, but ensure it's still a paragraph if cut
+            if i >= 50:
+                terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
                 break
             if is_binary:
+                terms_list.append(f"**{term}**")
             else:
+                terms_list.append(f"**{term}**: {weight:.4f}")
+        formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
+    info_output = f"Total non-zero terms: {len(indices)}\n"
     info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
     return formatted_output, info_output # Now returns two strings
     # and to ensure .item() works reliably for conversion to float.
     dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
+    # Format representations - these functions now return two strings (main_output, info_output)
     query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
     doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
+    # Combine output into a single string for the Markdown component
     full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
     full_output += "---\n\n"
+    # Query Representation
+    full_output += f"Query Representation ({query_model_name_display}):\n\n"
+    full_output += query_main_rep_str + "\n\n" + query_info_str # Added an extra newline for better spacing
+    full_output += "\n\n---\n\n" # Separator
+    # Document Representation
+    full_output += f"Document Representation ({doc_model_name_display}):\n\n"
+    full_output += doc_main_rep_str + "\n\n" + doc_info_str # Added an extra newline for better spacing
     return full_output