SiddharthAK committed on
Commit
da0c779
·
verified ·
1 Parent(s): 5bf8193

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -27
app.py CHANGED
@@ -113,12 +113,15 @@ def get_splade_cocondenser_representation(text):
113
 
114
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
115
 
116
- formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n"
117
  if not sorted_representation:
118
  formatted_output += "No significant terms found for this input.\n"
119
  else:
 
 
120
  for term, weight in sorted_representation:
121
- formatted_output += f"- **{term}**: {weight:.4f}\n"
 
122
 
123
  info_output = f"--- Sparse Vector Info ---\n"
124
  info_output += f"Total non-zero terms in vector: {len(indices)}\n"
@@ -168,12 +171,15 @@ def get_splade_lexical_representation(text):
168
 
169
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
170
 
171
- formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n"
172
  if not sorted_representation:
173
  formatted_output += "No significant terms found for this input.\n"
174
  else:
 
 
175
  for term, weight in sorted_representation:
176
- formatted_output += f"- **{term}**: {weight:.4f}\n"
 
177
 
178
  info_output = f"--- Raw Sparse Vector Info ---\n"
179
  info_output += f"Total non-zero terms in vector: {len(indices)}\n"
@@ -210,15 +216,15 @@ def get_splade_doc_representation(text):
210
 
211
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
212
 
213
- formatted_output = "Binary Bag-of-Words Representation:\n" # Changed title
214
  if not sorted_representation:
215
  formatted_output += "No significant terms found for this input.\n"
216
  else:
217
- for i, (term, _) in enumerate(sorted_representation):
218
- if i >= 50: # Limit display for very long lists
219
- formatted_output += f"...and {len(sorted_representation) - 50} more terms.\n"
220
- break
221
- formatted_output += f"- **{term}**\n"
222
 
223
  info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
224
  info_output += f"Total activated terms: {len(indices)}\n"
@@ -302,7 +308,7 @@ def get_splade_doc_vector(text):
302
  # This function remains unchanged as it's a generic formatter for any sparse vector.
303
  def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
304
  if splade_vector is None:
305
- return "Failed to generate vector."
306
 
307
  indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
308
  if not isinstance(indices, list):
@@ -326,20 +332,23 @@ def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
326
  else:
327
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
328
 
329
- formatted_output = ""
330
  if not sorted_representation:
331
  formatted_output += "No significant terms found.\n"
332
  else:
 
333
  for i, (term, weight) in enumerate(sorted_representation):
334
- if i >= 50 and is_binary: # Limit display for very long binary lists
335
- formatted_output += f"...and {len(sorted_representation) - 50} more terms.\n"
 
336
  break
337
  if is_binary:
338
- formatted_output += f"- **{term}**\n"
339
  else:
340
- formatted_output += f"- **{term}**: {weight:.4f}\n"
 
341
 
342
- info_output = f"\nTotal non-zero terms: {len(indices)}\n"
343
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
344
 
345
  return formatted_output, info_output # Now returns two strings
@@ -375,22 +384,23 @@ def calculate_dot_product_and_representations_independent(query_model_choice, do
375
  # and to ensure .item() works reliably for conversion to float.
376
  dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
377
 
378
- # Format representations
379
- # These functions now return two strings (main_output, info_output)
380
  query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
381
  doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
382
 
383
 
384
- query_rep_str = f"Query Representation ({query_model_name_display}):\n"
385
- query_rep_str += query_main_rep_str + "\n" + query_info_str
386
-
387
- doc_rep_str = f"Document Representation ({doc_model_name_display}):\n"
388
- doc_rep_str += doc_main_rep_str + "\n" + doc_info_str
389
-
390
- # Combine output
391
  full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
392
  full_output += "---\n\n"
393
- full_output += f"{query_rep_str}\n\n---\n\n{doc_rep_str}"
 
 
 
 
 
 
 
 
394
 
395
  return full_output
396
 
 
113
 
114
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
115
 
116
+ formatted_output = "MLM encoder (SPLADE-cocondenser-distil):\n\n" # Added newline
117
  if not sorted_representation:
118
  formatted_output += "No significant terms found for this input.\n"
119
  else:
120
+ # Changed to paragraph style
121
+ terms_list = []
122
  for term, weight in sorted_representation:
123
+ terms_list.append(f"**{term}**: {weight:.4f}")
124
+ formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
125
 
126
  info_output = f"--- Sparse Vector Info ---\n"
127
  info_output += f"Total non-zero terms in vector: {len(indices)}\n"
 
171
 
172
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
173
 
174
+ formatted_output = "SPLADE-v3-Lexical Representation (Weighting):\n\n" # Added newline
175
  if not sorted_representation:
176
  formatted_output += "No significant terms found for this input.\n"
177
  else:
178
+ # Changed to paragraph style
179
+ terms_list = []
180
  for term, weight in sorted_representation:
181
+ terms_list.append(f"**{term}**: {weight:.4f}")
182
+ formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
183
 
184
  info_output = f"--- Raw Sparse Vector Info ---\n"
185
  info_output += f"Total non-zero terms in vector: {len(indices)}\n"
 
216
 
217
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[0]) # Sort alphabetically for clarity
218
 
219
+ formatted_output = "Binary Bag-of-Words Representation:\n\n" # Changed title, added newline
220
  if not sorted_representation:
221
  formatted_output += "No significant terms found for this input.\n"
222
  else:
223
+ # Changed to paragraph style
224
+ terms_list = []
225
+ for term, _ in sorted_representation: # For binary, weight is always 1, so no need to display
226
+ terms_list.append(f"**{term}**")
227
+ formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
228
 
229
  info_output = f"--- Raw Binary Bag-of-Words Vector Info ---\n" # Changed title
230
  info_output += f"Total activated terms: {len(indices)}\n"
 
308
  # This function remains unchanged as it's a generic formatter for any sparse vector.
309
  def format_sparse_vector_output(splade_vector, tokenizer, is_binary=False):
310
  if splade_vector is None:
311
+ return "Failed to generate vector.", ""
312
 
313
  indices = torch.nonzero(splade_vector).squeeze().cpu().tolist()
314
  if not isinstance(indices, list):
 
332
  else:
333
  sorted_representation = sorted(meaningful_tokens.items(), key=lambda item: item[1], reverse=True)
334
 
335
+ formatted_output = "" # Removed initial newline to allow control outside
336
  if not sorted_representation:
337
  formatted_output += "No significant terms found.\n"
338
  else:
339
+ terms_list = []
340
  for i, (term, weight) in enumerate(sorted_representation):
341
+ # Limit display for very long lists, but ensure it's still a paragraph if cut
342
+ if i >= 50:
343
+ terms_list.append(f"...and {len(sorted_representation) - 50} more terms.")
344
  break
345
  if is_binary:
346
+ terms_list.append(f"**{term}**")
347
  else:
348
+ terms_list.append(f"**{term}**: {weight:.4f}")
349
+ formatted_output += ", ".join(terms_list) + "." # Join with comma and space, end with period
350
 
351
+ info_output = f"Total non-zero terms: {len(indices)}\n"
352
  info_output += f"Sparsity: {1 - (len(indices) / tokenizer.vocab_size):.2%}\n"
353
 
354
  return formatted_output, info_output # Now returns two strings
 
384
  # and to ensure .item() works reliably for conversion to float.
385
  dot_product = float(torch.dot(query_vector.cpu(), doc_vector.cpu()).item())
386
 
387
+ # Format representations - these functions now return two strings (main_output, info_output)
 
388
  query_main_rep_str, query_info_str = format_sparse_vector_output(query_vector, query_tokenizer, query_is_binary)
389
  doc_main_rep_str, doc_info_str = format_sparse_vector_output(doc_vector, doc_tokenizer, doc_is_binary)
390
 
391
 
392
+ # Combine output into a single string for the Markdown component
 
 
 
 
 
 
393
  full_output = f"### Dot Product Score: {dot_product:.6f}\n\n"
394
  full_output += "---\n\n"
395
+
396
+ # Query Representation
397
+ full_output += f"Query Representation ({query_model_name_display}):\n\n"
398
+ full_output += query_main_rep_str + "\n\n" + query_info_str # Added an extra newline for better spacing
399
+ full_output += "\n\n---\n\n" # Separator
400
+
401
+ # Document Representation
402
+ full_output += f"Document Representation ({doc_model_name_display}):\n\n"
403
+ full_output += doc_main_rep_str + "\n\n" + doc_info_str # Added an extra newline for better spacing
404
 
405
  return full_output
406