Spaces:

lucalp
/

blt-entropy-patcher

Running on Zero

App Files Community

lucalp committed on May 21

Commit

5e19880

1 Parent(s): 9fefb2b

Better handling of spaces

Browse files

Files changed (1) hide show

app.py +3 -3

app.py CHANGED Viewed

@@ -226,7 +226,7 @@ class BytelatentProcessor:
                     # Boundary check, though loop structure should prevent out-of-bounds
                     if current_byte_abs_idx < len(all_token_ids):
                         label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
-                        display_text = f"{processed_char_text}-{j+1}".replace(" ", "_")
                         highlighted_data.append((display_text, label))
                     else: # Should ideally not be reached
                         logging.error(f"Critical: Token index {current_byte_abs_idx} out of bounds for labeling.")
@@ -248,7 +248,7 @@ class BytelatentProcessor:
                 except Exception:
                     pass # Stick with the err_byte display_text
-                highlighted_data.append((display_text.replace(" ", "_"), label))
                 logging.warning(
                     f"Token ID {problem_byte_id} at index {current_byte_abs_idx} "
                     f"could not be part of a validly decoded character using iterative decode. Fallback: '{display_text}'."
@@ -321,7 +321,7 @@ class BytelatentProcessor:
             # Create highlighted text data
             _highlighted_data, patch_count = self._create_highlight_data(patch_lengths, tokens)
-            ind_highlighted_data = [(text.replace("-1", ""), label) for text, label in _highlighted_data]
             grouped_data = defaultdict(str)
             for text, label in ind_highlighted_data:
                 grouped_data[label] += text

                     # Boundary check, though loop structure should prevent out-of-bounds
                     if current_byte_abs_idx < len(all_token_ids):
                         label = token_to_patch_label[current_byte_abs_idx] if current_byte_abs_idx < len(token_to_patch_label) else "Error: Label Missing"
+                        display_text = f"{processed_char_text}-{j+1}"
                         highlighted_data.append((display_text, label))
                     else: # Should ideally not be reached
                         logging.error(f"Critical: Token index {current_byte_abs_idx} out of bounds for labeling.")
                 except Exception:
                     pass # Stick with the err_byte display_text
+                highlighted_data.append((display_text, label))
                 logging.warning(
                     f"Token ID {problem_byte_id} at index {current_byte_abs_idx} "
                     f"could not be part of a validly decoded character using iterative decode. Fallback: '{display_text}'."
             # Create highlighted text data
             _highlighted_data, patch_count = self._create_highlight_data(patch_lengths, tokens)
+            ind_highlighted_data = [(text.replace(" -1", "_-1").replace("-1", ""), label) for text, label in _highlighted_data]
             grouped_data = defaultdict(str)
             for text, label in ind_highlighted_data:
                 grouped_data[label] += text