jvamvas committed
Commit a35d485 · Parent(s): e8950c7

Initial commit

.gitignore ADDED
@@ -0,0 +1 @@
+gsw_tokenizer/sentencepiece.bpe.model
app.py ADDED
@@ -0,0 +1,142 @@
+import gradio as gr
+import utils
+import os
+
+# Create a custom theme
+theme = gr.themes.Base(
+    text_size="lg",
+    radius_size="none",
+    font=[gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
+)
+
+# Load tokenizers only once during development
+if gr.NO_RELOAD:
+    print("Loading tokenizers...")
+    all_tokenizers = utils.load_tokenizers()
+    all_tokenizer_names = list(all_tokenizers.keys())
+    print("Tokenizers loaded!")
+
+def read_svg_file(name: str) -> str:
+    """Read SVG file content."""
+    icon_map = {
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
+        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
+        "ZurichNLP/swissbert": "swissbert.svg",
+        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
+        "google/gemma-3-27b-it": "gemma.svg",
+        "gpt-4o": "chatgpt.svg"
+    }
+    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
+    try:
+        with open(icon_path, 'r') as f:
+            return f.read()
+    except Exception as e:
+        print(f"Error reading SVG file {icon_path}: {e}")
+        return ""
+
+def get_model_icon(name: str) -> str:
+    """Get the HTML for the model icon."""
+    # Skip icons for collapsed models
+    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+        return ""
+
+    svg_content = read_svg_file(name)
+    if svg_content:
+        # Add viewBox and preserve aspect ratio to the SVG element
+        svg_content = svg_content.replace('<svg', '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"')
+        # Wrap in a container that maintains aspect ratio
+        return f'<div style="display: inline-block; vertical-align: middle; margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">{svg_content}</div>'
+    return ""
+
+def process_text(text):
+    """Process the input text and return visualizations for all tokenizers."""
+    # Use the pre-loaded tokenizers
+    visualizations = utils.visualize_tokens(text, all_tokenizers)
+    return list(visualizations.values()) + [gr.update(visible=True)]
+
+# Create the Gradio interface
+with gr.Blocks(title="Tokens matter.", theme=theme, css="""
+.tokenizer-panel > div { background: var(--input-background-fill); }
+.no-padding { padding: 0 !important; }
+.form { border: 0 !important; }
+.html-container { line-height: 2em !important; }
+.pending { opacity: 1; }
+""") as demo:
+    gr.Markdown("# Tokens matter.")
+
+    with gr.Row():
+        # Left column for inputs
+        with gr.Column(scale=1):
+            input_text = gr.Textbox(
+                label="Input Text:",
+                placeholder="Enter text to tokenize ...",
+                value="Als Zürcher bini nöd so Fan vom FC Basel.",
+                lines=3,
+                elem_classes="no-padding",
+                interactive=True,
+                every=True,  # This enables real-time updates
+            )
+
+        # Right column for outputs
+        with gr.Column(scale=2):
+            # Create output boxes for main tokenizers
+            main_output_boxes = []
+            more_output_boxes = []
+
+            # Create 2x2 grid for main tokenizers
+            with gr.Row():
+                with gr.Column():
+                    for name in all_tokenizer_names[:2]:
+                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+                            continue
+                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
+                        with gr.Group(elem_classes="tokenizer-panel"):
+                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
+                            box = gr.HTML()
+                            main_output_boxes.append(box)
+                with gr.Column():
+                    for name in all_tokenizer_names[2:4]:
+                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+                            continue
+                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
+                        with gr.Group(elem_classes="tokenizer-panel"):
+                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
+                            box = gr.HTML()
+                            main_output_boxes.append(box)
+
+            # Display more tokenizers in accordion
+            more_models = gr.Accordion("More Models", open=False, visible=False)
+            with more_models:
+                for name in all_tokenizer_names:
+                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
+                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
+                        with gr.Group(elem_classes="tokenizer-panel"):
+                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
+                            box = gr.HTML()
+                            more_output_boxes.append(box)
+
+    all_outputs = main_output_boxes + more_output_boxes + [more_models]
+
+    # Use change event for real-time updates
+    input_text.change(
+        fn=process_text,
+        inputs=[input_text],
+        outputs=all_outputs,
+        show_progress="hidden",
+    )
+
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["Als Zürcher bini nöd so Fan vom FC Basel."],
+            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
+            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
+            ["Come Zurighese, non sono un grande fan del FC Basilea."],
+            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
+            ["As a Zurich resident, I am not a big fan of FC Basel."],
+        ],
+        inputs=input_text
+    )
+
+if __name__ == "__main__":
+    demo.launch()
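
A minimal local-run sketch (not part of the commit): it assumes the pinned requirements are installed and that the gated Hugging Face models are accessible; the host and port values are hypothetical.

    # Importing app builds the Blocks UI and pre-loads all tokenizers;
    # app.py only calls launch() behind the __main__ guard, so we launch here.
    from app import demo
    demo.launch(server_name="127.0.0.1", server_port=7860)  # assumed host/port, not taken from app.py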
gsw_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": "<unk>"
+}
+
gsw_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}
icons/chatgpt.svg ADDED
icons/deepseek.svg ADDED
icons/llama.svg ADDED
icons/swissbert.svg ADDED
icons/swissbert_v0.svg ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
+gradio==5.29.0
+transformers[sentencepiece]==4.51.3
+tiktoken==0.9.0
tests/test_utils.py ADDED
@@ -0,0 +1,136 @@
+import unittest
+from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer
+
+class TestTokenizerLoading(unittest.TestCase):
+    def test_load_hf_tokenizer(self):
+        """Test loading a single HuggingFace tokenizer."""
+        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
+        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
+        self.assertIsNotNone(tokenizer)
+
+    def test_load_openai_tokenizer(self):
+        """Test loading a single OpenAI tokenizer."""
+        name, tokenizer = load_openai_tokenizer("gpt-4o")
+        self.assertEqual(name, "gpt-4o")
+        self.assertIsNotNone(tokenizer)
+
+    def test_load_tokenizers(self):
+        """Test loading all tokenizers."""
+        tokenizers = load_tokenizers()
+
+        # Check that we have the expected number of tokenizers
+        expected_count = 6  # 5 HF + 1 OpenAI
+        self.assertEqual(len(tokenizers), expected_count)
+
+        # Check that all expected tokenizers are present
+        expected_names = {
+            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+            "deepseek-ai/DeepSeek-V3-0324",
+            "ZurichNLP/swissbert",
+            "mistralai/Mistral-Nemo-Instruct-2407",
+            "google/gemma-3-27b-it",
+            "gpt-4o"
+        }
+        self.assertEqual(set(tokenizers.keys()), expected_names)
+
+        # Check that all tokenizers are valid
+        for name, tokenizer in tokenizers.items():
+            self.assertIsNotNone(tokenizer)
+
+    def test_load_gsw_tokenizer(self):
+        """Test loading the Swiss German tokenizer from local files."""
+        name, tokenizer = load_gsw_tokenizer()
+        self.assertEqual(name, "swissbert-gsw")
+        self.assertIsNotNone(tokenizer)
+
+        # Test basic tokenization functionality
+        test_text = "nöd"
+        tokens = tokenize(test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+
+class TestTokenizerFunctionality(unittest.TestCase):
+    def setUp(self):
+        """Set up tokenizers for testing."""
+        self.tokenizers = load_tokenizers()
+        self.test_text = "Dies ist ein Test."
+
+    def test_tokenize_llama(self):
+        """Test tokenization with Llama tokenizer."""
+        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+    def test_tokenize_deepseek(self):
+        """Test tokenization with DeepSeek tokenizer."""
+        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+    def test_tokenize_swissbert(self):
+        """Test tokenization with SwissBERT tokenizer."""
+        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+    def test_tokenize_gpt4(self):
+        """Test tokenization with GPT-4 tokenizer."""
+        tokenizer = self.tokenizers["gpt-4o"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+    def test_tokenize_swissbert_comparison(self):
+        """Test that SwissBERT tokenization compares both tokenizers and returns shorter result."""
+        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
+
+        # Test with a Swiss German word
+        test_text = "nöd"
+        tokens = tokenize(test_text, tokenizer)
+
+        # Verify we get a valid tokenization
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+
+        # Get both tokenizations directly to verify comparison
+        _, gsw_tokenizer = load_gsw_tokenizer()
+        swissbert_tokens = tokenize(test_text, tokenizer)
+        gsw_tokens = tokenize(test_text, gsw_tokenizer)
+
+        # Verify that the returned tokenization is the shorter one
+        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
+        self.assertEqual(tokens, expected_tokens)
+
+    def test_tokenize_mistral(self):
+        """Test tokenization with Mistral NeMo tokenizer."""
+        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+    def test_tokenize_gemma(self):
+        """Test tokenization with Gemma 3 tokenizer."""
+        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
+        tokens = tokenize(self.test_text, tokenizer)
+        self.assertIsInstance(tokens, list)
+        self.assertTrue(all(isinstance(t, str) for t in tokens))
+        self.assertTrue(len(tokens) > 0)
+        print(tokens)
+
+if __name__ == '__main__':
+    unittest.main()
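
A minimal sketch for running this suite programmatically from the repository root (so that `utils` resolves); it is not part of the commit, and the verbosity level is an arbitrary choice.

    # Discover and run tests/test_utils.py; requires access to the listed HF models and gpt-4o via tiktoken.
    import unittest
    suite = unittest.defaultTestLoader.discover("tests", pattern="test_utils.py")
    unittest.TextTestRunner(verbosity=2).run(suite)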
theme.py ADDED
@@ -0,0 +1,3 @@
+import gradio as gr
+
+gr.themes.builder()
utils.py ADDED
@@ -0,0 +1,235 @@
+from typing import Dict, List, Tuple
+from pathlib import Path
+
+from transformers import AutoTokenizer
+import tiktoken
+
+# UZH color palette
+UZH_COLORS = [
+    "#BACBFF",  # UZH Blue V1
+    "#DBF4F9",  # UZH Cyan V1
+    "#ECF6D6",  # UZH Apple V1
+    "#FFF4DA",  # UZH Gold V1
+    "#FFDBCC",  # UZH Orange V1
+    "#FBC6D4",  # UZH Berry V1
+    "#C2C2C2",  # UZH Grey V1
+    "#FAFAFA",  # UZH Light Grey V1
+    "#7596FF",  # UZH Blue V2
+    "#B7E9F4",  # UZH Cyan V2
+    "#DBEDAD",  # UZH Apple V2
+    "#FFE9B5",  # UZH Gold V2
+    "#FEB799",  # UZH Orange V2
+    "#F78CAA",  # UZH Berry V2
+    "#A3A3A3",  # UZH Grey V2
+    "#EFEFEF",  # UZH Light Grey V2
+]
+
+def load_hf_tokenizer(name: str) -> Tuple[str, object]:
+    """
+    Load a single HuggingFace tokenizer.
+
+    Args:
+        name: The name of the tokenizer to load
+
+    Returns:
+        Tuple of (tokenizer_name, tokenizer_object)
+    """
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            name,
+            use_fast=True,
+            model_max_length=1000000,
+            clean_up_tokenization_spaces=True,
+            legacy=False
+        )
+    except Exception as e:
+        tokenizer = AutoTokenizer.from_pretrained(
+            name,
+            model_max_length=1000000,
+            clean_up_tokenization_spaces=True,
+            legacy=False
+        )
+    return name, tokenizer
+
+def load_openai_tokenizer(name: str) -> Tuple[str, object]:
+    """
+    Load a single OpenAI tokenizer.
+
+    Args:
+        name: The name of the tokenizer to load
+
+    Returns:
+        Tuple of (tokenizer_name, tokenizer_object)
+    """
+    return name, tiktoken.encoding_for_model(name)
+
+def load_gsw_tokenizer() -> Tuple[str, object]:
+    """
+    Load a Swiss German (GSW) tokenizer from local vocabulary files in the gsw_tokenizer directory.
+
+    Returns:
+        Tuple of (tokenizer_name, tokenizer_object)
+    """
+    tokenizer_path = Path(__file__).parent / "gsw_tokenizer"
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    return "swissbert-gsw", tokenizer
+
+def load_tokenizers() -> Dict[str, object]:
+    """
+    Load all tokenizers.
+
+    Returns:
+        Dictionary mapping tokenizer names to tokenizer objects
+    """
+    tokenizers = {}
+
+    # Load OpenAI tokenizers first
+    openai_names = ["gpt-4o"]
+    for name in openai_names:
+        tokenizer_name, tokenizer = load_openai_tokenizer(name)
+        tokenizers[tokenizer_name] = tokenizer
+
+    # Load HuggingFace tokenizers in specified order
+    hf_names = [
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "deepseek-ai/DeepSeek-V3-0324",
+        "ZurichNLP/swissbert",
+        "google/gemma-3-27b-it",
+        "mistralai/Mistral-Nemo-Instruct-2407",
+        "CohereLabs/aya-expanse-8b",
+    ]
+    for name in hf_names:
+        tokenizer_name, tokenizer = load_hf_tokenizer(name)
+        tokenizers[tokenizer_name] = tokenizer
+
+    return tokenizers
+
+# Mapping of model names to display names
+MODEL_DISPLAY_NAMES = {
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
+    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
+    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
+    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
+    "google/gemma-3-27b-it": "Gemma 3",
+    "gpt-4o": "ChatGPT (gpt-4o)",
+    "CohereLabs/aya-expanse-8b": "Aya Expanse"
+}
+
+def tokenize(s: str, tokenizer) -> List[str]:
+    """
+    Tokenize a string using any tokenizer returned by load_tokenizers().
+    For the SwissBERT tokenizer, compares the SwissBERT and SwissBERT-GSW tokenizations and returns the shorter one.
+
+    Args:
+        s: The string to tokenize
+        tokenizer: A tokenizer from load_hf_tokenizer() or load_openai_tokenizer()
+
+    Returns:
+        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
+    """
+    # Special handling for SwissBERT tokenizer
+    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
+        # Get SwissBERT-GSW tokenizer
+        _, gsw_tokenizer = load_gsw_tokenizer()
+
+        # Get tokenizations from both tokenizers
+        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
+        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
+
+        # Return the shorter tokenization
+        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
+        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
+            shorter_tokens[0] = shorter_tokens[0][1:]
+        return shorter_tokens
+
+    return _tokenize_with_tokenizer(s, tokenizer)
+
+def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
+    """
+    Internal helper function to tokenize a string with a given tokenizer.
+
+    Args:
+        s: The string to tokenize
+        tokenizer: A tokenizer object
+
+    Returns:
+        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
+    """
+    if hasattr(tokenizer, "tokenize"):
+        encoded = tokenizer.encode(s, add_special_tokens=False)
+        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
+            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
+        else:
+            tokens = tokenizer.convert_ids_to_tokens(encoded)
+
+        filtered_tokens = []
+        for t in tokens:
+            if t.startswith("<") or t.startswith("["):
+                continue
+            elif "Ġ" in t:
+                filtered_tokens.append(t.replace("Ġ", " "))
+            elif "Ċ" in t:
+                filtered_tokens.append(t.replace("Ċ", " "))
+            elif t.startswith("▁"):
+                filtered_tokens.append(" " + t[1:])
+            else:
+                filtered_tokens.append(t)
+
+        return [t.rstrip("##").rstrip("@@") for t in filtered_tokens]
+
+    elif hasattr(tokenizer, "encode"):
+        token_ids = tokenizer.encode(s)
+        return [tokenizer.decode([token_id]) for token_id in token_ids]
+
+    else:
+        raise ValueError("Unsupported tokenizer type")
+
+def get_uzh_color(index):
+    """Get a color from the UZH color palette based on index."""
+    return UZH_COLORS[index % len(UZH_COLORS)]
+
+def visualize_tokens(text: str, tokenizers: Dict[str, object]):
+    """
+    Tokenize text with each tokenizer and visualize the tokens with colors.
+    Colors are consistent across tokenizers for the same token sequences.
+    Colors are deterministic based on token content.
+
+    Args:
+        text: The input text to tokenize
+        tokenizers: Dictionary of tokenizers
+
+    Returns:
+        Dictionary mapping tokenizer names to HTML visualizations
+    """
+    results = {}
+
+    # First pass: collect all unique tokens across all tokenizers
+    all_tokens = set()
+    for tokenizer in tokenizers.values():
+        tokens = tokenize(text, tokenizer)
+        all_tokens.update(tokens)
+
+    # Generate colors for all unique tokens using hash-based approach
+    token_colors = {}
+    for token in all_tokens:
+        # Use hash of token to get a deterministic index
+        token_hash = hash(token)
+        # Ensure positive index and wrap around to color list length
+        index = abs(token_hash) % len(UZH_COLORS)
+        token_colors[token] = get_uzh_color(index)
+
+    # Second pass: create visualizations using the consistent colors
+    for name, tokenizer in tokenizers.items():
+        tokens = tokenize(text, tokenizer)
+
+        # Create a colored visualization
+        html = ""
+
+        # Build the HTML with colored spans for each token
+        for token in tokens:
+            color = token_colors[token]
+            html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{token}</span>'
+
+        results[name] = html
+
+    return results
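
A minimal usage sketch of the helpers above (not part of the commit): it assumes access to the Hugging Face Hub for the listed models and the local gsw_tokenizer files; the sample sentence is the app's default input.

    import utils

    tokenizers = utils.load_tokenizers()

    # Token lists come back with special tokens stripped and subword markers normalized.
    tokens = utils.tokenize("Als Zürcher bini nöd so Fan vom FC Basel.", tokenizers["gpt-4o"])
    print(tokens)

    # HTML snippets keyed by model name, one colored <span> per token.
    html_by_model = utils.visualize_tokens("Als Zürcher bini nöd so Fan vom FC Basel.", tokenizers)
    print(html_by_model["ZurichNLP/swissbert"])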