Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -113,12 +113,15 @@ def get_splade_cocondenser_representation(text):
|
|
113 |
|
114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
115 |
|
116 |
-
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n"
|
117 |
if not sorted_representation:
|
118 |
formatted_output += "No significant terms found for this input.\n"
|
119 |
else:
|
|
|
|
|
120 |
for term, weight in sorted_representation:
|
121 |
-
|
|
|
122 |
|
123 |
info_output = f"--- Sparse Vector Info ---\n"
|
124 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
@@ -168,12 +171,15 @@ def get_splade_lexical_representation(text):
|
|
168 |
|
169 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
170 |
|
171 |
-
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n"
|
172 |
if not sorted_representation:
|
173 |
formatted_output += "No significant terms found for this input.\n"
|
174 |
else:
|
|
|
|
|
175 |
for term, weight in sorted_representation:
|
176 |
-
|
|
|
177 |
|
178 |
info_output = f"--- Raw Sparse Vector Info ---\n"
|
179 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
@@ -210,15 +216,15 @@ def get_splade_doc_representation(text):
|
|
210 |
|
211 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
212 |
|
213 |
-
formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
|
214 |
if not sorted_representation:
|
215 |
formatted_output += "No significant terms found for this input.\n"
|
216 |
else:
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
|
223 |
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
|
224 |
info_output += f"Total activated terms: {len(indices)}\n"
|
@@ -302,7 +308,7 @@ def get_splade_doc_vector(text):
|
|
302 |
# This function remains unchanged as it's a generic formatter for any sparse vector.
|
303 |
def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
304 |
if splade_vector is None:
|
305 |
-
return "Failed to generate vector."
|
306 |
|
307 |
indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
|
308 |
if not isinstance(indices, list):
|
@@ -326,20 +332,23 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
|
326 |
else:
|
327 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
328 |
|
329 |
-
formatted_output = ""
|
330 |
if not sorted_representation:
|
331 |
formatted_output += "No significant terms found.\n"
|
332 |
else:
|
|
|
333 |
for i, (term, weight) in enumerate(sorted_representation):
|
334 |
-
|
335 |
-
|
|
|
336 |
break
|
337 |
if is_binary:
|
338 |
-
|
339 |
else:
|
340 |
-
|
|
|
341 |
|
342 |
-
info_output = f"
|
343 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
|
344 |
|
345 |
return formatted_output, info_output # Now returns two strings
|
@@ -375,22 +384,23 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
|
|
375 |
# and to ensure .item() works reliably for conversion to float.
|
376 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
377 |
|
378 |
-
# Format representations
|
379 |
-
# These functions now return two strings (main_output, info_output)
|
380 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
381 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
382 |
|
383 |
|
384 |
-
|
385 |
-
query_rep_str += query_main_rep_str + "\n" + query_info_str
|
386 |
-
|
387 |
-
doc_rep_str = f"Document Representation ({doc_model_name_display}):\n"
|
388 |
-
doc_rep_str += doc_main_rep_str + "\n" + doc_info_str
|
389 |
-
|
390 |
-
# Combine output
|
391 |
full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
|
392 |
full_output += "---\n\n"
|
393 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
394 |
|
395 |
return full_output
|
396 |
|
|
|
113 |
|
114 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
115 |
|
116 |
+
formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
|
117 |
if not sorted_representation:
|
118 |
formatted_output += "No significant terms found for this input.\n"
|
119 |
else:
|
120 |
+
# Changed to paragraph style
|
121 |
+
terms_list = []
|
122 |
for term, weight in sorted_representation:
|
123 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
124 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
125 |
|
126 |
info_output = f"--- Sparse Vector Info ---\n"
|
127 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
|
171 |
|
172 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
173 |
|
174 |
+
formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
|
175 |
if not sorted_representation:
|
176 |
formatted_output += "No significant terms found for this input.\n"
|
177 |
else:
|
178 |
+
# Changed to paragraph style
|
179 |
+
terms_list = []
|
180 |
for term, weight in sorted_representation:
|
181 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
182 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
183 |
|
184 |
info_output = f"--- Raw Sparse Vector Info ---\n"
|
185 |
info_output += f"Total non-zero terms in vector: {len(indices)}\n"
|
|
|
216 |
|
217 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
|
218 |
|
219 |
+
formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
|
220 |
if not sorted_representation:
|
221 |
formatted_output += "No significant terms found for this input.\n"
|
222 |
else:
|
223 |
+
# Changed to paragraph style
|
224 |
+
terms_list = []
|
225 |
+
for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
|
226 |
+
terms_list.append(f"**{term}**")
|
227 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
228 |
|
229 |
info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
|
230 |
info_output += f"Total activated terms: {len(indices)}\n"
|
|
|
308 |
# Generic formatter for any sparse vector. Always returns two strings, (formatted_output, info_output), including on the None-vector failure path.
|
309 |
def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
|
310 |
if splade_vector is None:
|
311 |
+
return "Failed to generate vector.", ""
|
312 |
|
313 |
indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
|
314 |
if not isinstance(indices, list):
|
|
|
332 |
else:
|
333 |
sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
|
334 |
|
335 |
+
formatted_output = "" # Removed initial newline to allow control outside
|
336 |
if not sorted_representation:
|
337 |
formatted_output += "No significant terms found.\n"
|
338 |
else:
|
339 |
+
terms_list = []
|
340 |
for i, (term, weight) in enumerate(sorted_representation):
|
341 |
+
# Limit display for very long lists, but ensure it's still a paragraph if cut
|
342 |
+
if i >= 50:
|
343 |
+
terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
|
344 |
break
|
345 |
if is_binary:
|
346 |
+
terms_list.append(f"**{term}**")
|
347 |
else:
|
348 |
+
terms_list.append(f"**{term}**: {weight:.4f}")
|
349 |
+
formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
|
350 |
|
351 |
+
info_output = f"Total non-zero terms: {len(indices)}\n"
|
352 |
info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
|
353 |
|
354 |
return formatted_output, info_output # Now returns two strings
|
|
|
384 |
# and to ensure .item() works reliably for conversion to float.
|
385 |
dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
|
386 |
|
387 |
+
# Format representations - these functions now return two strings (main_output, info_output)
|
|
|
388 |
query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
|
389 |
doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
|
390 |
|
391 |
|
392 |
+
# Combine output into a single string for the Markdown component
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
|
394 |
full_output += "---\n\n"
|
395 |
+
|
396 |
+
# Query Representation
|
397 |
+
full_output += f"Query Representation ({query_model_name_display}):\n\n"
|
398 |
+
full_output += query_main_rep_str + "\n\n" + query_info_str # Added an extra newline for better spacing
|
399 |
+
full_output += "\n\n---\n\n" # Separator
|
400 |
+
|
401 |
+
# Document Representation
|
402 |
+
full_output += f"Document Representation ({doc_model_name_display}):\n\n"
|
403 |
+
full_output += doc_main_rep_str + "\n\n" + doc_info_str # Added an extra newline for better spacing
|
404 |
|
405 |
return full_output
|
406 |
|