lucalp committed on
Commit
5e19880
·
1 Parent(s): 9fefb2b

Better handling of spaces

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -226,7 +226,7 @@ class BytelatentProcessor:
226
  # Boundary check, though loop structure should prevent out-of-bounds
227
  if current_byte_abs_idx < len(all_token_ids):
228
  label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
229
- display_text = f"{processed_char_text}-{j+1}".replace(" ", "_")
230
  highlighted_data.append((display_text, label))
231
  else: # Should ideally not be reached
232
  logging.error(f"Critical: Token index {current_byte_abs_idx} out of bounds for labeling.")
@@ -248,7 +248,7 @@ class BytelatentProcessor:
248
  except Exception:
249
  pass # Stick with the err_byte display_text
250
 
251
- highlighted_data.append((display_text.replace(" ", "_"), label))
252
  logging.warning(
253
  f"Token ID {problem_byte_id} at index {current_byte_abs_idx} "
254
  f"could not be part of a validly decoded character using iterative decode. Fallback: '{display_text}'."
@@ -321,7 +321,7 @@ class BytelatentProcessor:
321
 
322
  # Create highlighted text data
323
  _highlighted_data, patch_count = self._create_highlight_data(patch_lengths, tokens)
324
- ind_highlighted_data = [(text.replace("-1", ""), label) for text, label in _highlighted_data]
325
  grouped_data = defaultdict(str)
326
  for text, label in ind_highlighted_data:
327
  grouped_data[label] += text
 
226
  # Boundary check, though loop structure should prevent out-of-bounds
227
  if current_byte_abs_idx < len(all_token_ids):
228
  label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
229
+ display_text = f"{processed_char_text}-{j+1}"
230
  highlighted_data.append((display_text, label))
231
  else: # Should ideally not be reached
232
  logging.error(f"Critical: Token index {current_byte_abs_idx} out of bounds for labeling.")
 
248
  except Exception:
249
  pass # Stick with the err_byte display_text
250
 
251
+ highlighted_data.append((display_text, label))
252
  logging.warning(
253
  f"Token ID {problem_byte_id} at index {current_byte_abs_idx} "
254
  f"could not be part of a validly decoded character using iterative decode. Fallback: '{display_text}'."
 
321
 
322
  # Create highlighted text data
323
  _highlighted_data, patch_count = self._create_highlight_data(patch_lengths, tokens)
324
+ ind_highlighted_data = [(text.replace(" -1", "_-1").replace("-1", ""), label) for text, label in _highlighted_data]
325
  grouped_data = defaultdict(str)
326
  for text, label in ind_highlighted_data:
327
  grouped_data[label] += text