Initial commit

- .gitignore +1 -0
- app.py +142 -0
- gsw_tokenizer/special_tokens_map.json +40 -0
- gsw_tokenizer/tokenizer_config.json +19 -0
- icons/chatgpt.svg +1 -0
- icons/deepseek.svg +42 -0
- icons/llama.svg +18 -0
- icons/swissbert.svg +43 -0
- icons/swissbert_v0.svg +525 -0
- requirements.txt +3 -0
- tests/test_utils.py +136 -0
- theme.py +3 -0
- utils.py +235 -0
.gitignore
ADDED
@@ -0,0 +1 @@
gsw_tokenizer/sentencepiece.bpe.model

app.py
ADDED
@@ -0,0 +1,142 @@
import gradio as gr
import utils
import os

# Create a custom theme
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
)

# Load tokenizers only once during development
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    all_tokenizers = utils.load_tokenizers()
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")

def read_svg_file(name: str) -> str:
    """Read SVG file content."""
    icon_map = {
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
        "ZurichNLP/swissbert": "swissbert.svg",
        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
        "google/gemma-3-27b-it": "gemma.svg",
        "gpt-4o": "chatgpt.svg"
    }
    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
    try:
        with open(icon_path, 'r') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""

def get_model_icon(name: str) -> str:
    """Get the HTML for the model icon."""
    # Skip icons for collapsed models
    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
        return ""

    svg_content = read_svg_file(name)
    if svg_content:
        # Preserve the aspect ratio and force a fixed size on the SVG element
        svg_content = svg_content.replace('<svg', '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"')
        # Wrap in a container that maintains aspect ratio
        return f'<div style="display: inline-block; vertical-align: middle; margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">{svg_content}</div>'
    return ""

def process_text(text):
    """Process the input text and return visualizations for all tokenizers."""
    # Use the pre-loaded tokenizers
    visualizations = utils.visualize_tokens(text, all_tokenizers)
    return list(visualizations.values()) + [gr.update(visible=True)]

# Create the Gradio interface
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
    .tokenizer-panel > div { background: var(--input-background-fill); }
    .no-padding { padding: 0 !important; }
    .form { border: 0 !important; }
    .html-container { line-height: 2em !important; }
    .pending { opacity: 1; }
""") as demo:
    gr.Markdown("# Tokens matter.")

    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                every=True,  # This enables real-time updates
            )

        # Right column for outputs
        with gr.Column(scale=2):
            # Create output boxes for main tokenizers
            main_output_boxes = []
            more_output_boxes = []

            # Create 2x2 grid for main tokenizers
            with gr.Row():
                with gr.Column():
                    for name in all_tokenizer_names[:2]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)
                with gr.Column():
                    for name in all_tokenizer_names[2:4]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)

            # Display more tokenizers in accordion
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                for name in all_tokenizer_names:
                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            more_output_boxes.append(box)

    all_outputs = main_output_boxes + more_output_boxes + [more_models]

    # Use change event for real-time updates
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text
    )

if __name__ == "__main__":
    demo.launch()

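Note on app.py: the outputs=all_outputs list and the return value of process_text line up purely by position. utils.visualize_tokens returns one HTML string per tokenizer in load_tokenizers() insertion order (gpt-4o first, then the six Hugging Face models), which matches the order in which the four main boxes and the three accordion boxes are created above, and the final gr.update(visible=True) targets the trailing more_models accordion.
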
gsw_tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,40 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "cls_token": {
    "content": "<s>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<pad>",
  "sep_token": {
    "content": "</s>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": "<unk>"
}

gsw_tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,19 @@
{
  "bos_token": "<s>",
  "cls_token": "<s>",
  "eos_token": "</s>",
  "mask_token": {
    "__type": "AddedToken",
    "content": "<mask>",
    "lstrip": true,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "model_max_length": 512,
  "pad_token": "<pad>",
  "sep_token": "</s>",
  "sp_model_kwargs": {},
  "tokenizer_class": "XLMRobertaTokenizer",
  "unk_token": "<unk>"
}

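These two files only carry the configuration of the SwissBERT-GSW tokenizer (an XLMRobertaTokenizer); the actual gsw_tokenizer/sentencepiece.bpe.model is excluded by the .gitignore above, so it presumably has to be uploaded to the Space separately. Assuming that model file is in place, loading the directory comes down to the same call that utils.load_gsw_tokenizer() makes further down:

    # Minimal sketch; assumes gsw_tokenizer/sentencepiece.bpe.model is present locally.
    from transformers import AutoTokenizer

    gsw = AutoTokenizer.from_pretrained("gsw_tokenizer")  # picks up tokenizer_config.json above
    print(gsw.tokenize("nöd"))  # SentencePiece pieces, "▁"-prefixed at word starts
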
icons/chatgpt.svg
ADDED
icons/deepseek.svg
ADDED
icons/llama.svg
ADDED
icons/swissbert.svg
ADDED
icons/swissbert_v0.svg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio==5.29.0
transformers[sentencepiece]==4.51.3
tiktoken==0.9.0

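As a quick sanity check that the pinned dependencies resolve, something along these lines can be run after installing requirements.txt; it mirrors the tiktoken code path in utils.py further down (the exact token split depends on the gpt-4o encoding shipped with tiktoken):

    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-4o")  # same call as utils.load_openai_tokenizer("gpt-4o")
    ids = enc.encode("Als Zürcher bini nöd so Fan vom FC Basel.")
    print(len(ids), [enc.decode([i]) for i in ids])  # token count and per-token strings
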
tests/test_utils.py
ADDED
@@ -0,0 +1,136 @@
import unittest
from utils import load_hf_tokenizer, load_openai_tokenizer, load_tokenizers, tokenize, load_gsw_tokenizer

class TestTokenizerLoading(unittest.TestCase):
    def test_load_hf_tokenizer(self):
        """Test loading a single HuggingFace tokenizer."""
        name, tokenizer = load_hf_tokenizer("meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertEqual(name, "meta-llama/Llama-4-Scout-17B-16E-Instruct")
        self.assertIsNotNone(tokenizer)

    def test_load_openai_tokenizer(self):
        """Test loading a single OpenAI tokenizer."""
        name, tokenizer = load_openai_tokenizer("gpt-4o")
        self.assertEqual(name, "gpt-4o")
        self.assertIsNotNone(tokenizer)

    def test_load_tokenizers(self):
        """Test loading all tokenizers."""
        tokenizers = load_tokenizers()

        # Check that we have the expected number of tokenizers
        expected_count = 7  # 6 HF + 1 OpenAI
        self.assertEqual(len(tokenizers), expected_count)

        # Check that all expected tokenizers are present
        expected_names = {
            "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "ZurichNLP/swissbert",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "google/gemma-3-27b-it",
            "CohereLabs/aya-expanse-8b",
            "gpt-4o"
        }
        self.assertEqual(set(tokenizers.keys()), expected_names)

        # Check that all tokenizers are valid
        for name, tokenizer in tokenizers.items():
            self.assertIsNotNone(tokenizer)

    def test_load_gsw_tokenizer(self):
        """Test loading the Swiss German tokenizer from local files."""
        name, tokenizer = load_gsw_tokenizer()
        self.assertEqual(name, "swissbert-gsw")
        self.assertIsNotNone(tokenizer)

        # Test basic tokenization functionality
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)

class TestTokenizerFunctionality(unittest.TestCase):
    def setUp(self):
        """Set up tokenizers for testing."""
        self.tokenizers = load_tokenizers()
        self.test_text = "Dies ist ein Test."

    def test_tokenize_llama(self):
        """Test tokenization with Llama tokenizer."""
        tokenizer = self.tokenizers["meta-llama/Llama-4-Scout-17B-16E-Instruct"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_deepseek(self):
        """Test tokenization with DeepSeek tokenizer."""
        tokenizer = self.tokenizers["deepseek-ai/DeepSeek-V3-0324"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert(self):
        """Test tokenization with SwissBERT tokenizer."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gpt4(self):
        """Test tokenization with GPT-4 tokenizer."""
        tokenizer = self.tokenizers["gpt-4o"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_swissbert_comparison(self):
        """Test that SwissBERT tokenization compares both tokenizers and returns the shorter result."""
        tokenizer = self.tokenizers["ZurichNLP/swissbert"]

        # Test with a Swiss German word
        test_text = "nöd"
        tokens = tokenize(test_text, tokenizer)

        # Verify we get a valid tokenization
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)

        # Get both tokenizations directly to verify the comparison
        _, gsw_tokenizer = load_gsw_tokenizer()
        swissbert_tokens = tokenize(test_text, tokenizer)
        gsw_tokens = tokenize(test_text, gsw_tokenizer)

        # Verify that the returned tokenization is the shorter one
        expected_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        self.assertEqual(tokens, expected_tokens)

    def test_tokenize_mistral(self):
        """Test tokenization with Mistral NeMo tokenizer."""
        tokenizer = self.tokenizers["mistralai/Mistral-Nemo-Instruct-2407"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

    def test_tokenize_gemma(self):
        """Test tokenization with Gemma 3 tokenizer."""
        tokenizer = self.tokenizers["google/gemma-3-27b-it"]
        tokens = tokenize(self.test_text, tokenizer)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(t, str) for t in tokens))
        self.assertTrue(len(tokens) > 0)
        print(tokens)

if __name__ == '__main__':
    unittest.main()

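The tests are presumably meant to be run from the repository root so that utils is importable, e.g. with python -m unittest tests.test_utils -v. They download the Llama 4, Gemma 3 and Aya tokenizers from the Hugging Face Hub, so network access and, for the gated repositories, an authenticated token are likely required.
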
theme.py
ADDED
@@ -0,0 +1,3 @@
import gradio as gr

gr.themes.builder()

utils.py
ADDED
@@ -0,0 +1,235 @@
from typing import Dict, List, Tuple
from pathlib import Path

from transformers import AutoTokenizer
import tiktoken

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]

def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    except Exception as e:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    return name, tokenizer

def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)

def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load a Swiss German (GSW) tokenizer from local vocabulary files in the gsw_tokenizer directory.

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer_path = Path(__file__).parent / "gsw_tokenizer"
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return "swissbert-gsw", tokenizer

def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Load HuggingFace tokenizers in specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers

# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse"
}

def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer from load_tokenizers().
    For the SwissBERT tokenizer, compares both SwissBERT and SwissBERT-GSW tokenizations and returns the shorter one.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer from load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    # Special handling for SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()

        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)

        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens

    return _tokenize_with_tokenizer(s, tokenizer)

def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper function to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)

        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                continue
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)

        return [t.rstrip("##").rstrip("@@") for t in filtered_tokens]

    elif hasattr(tokenizer, "encode"):
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]

    else:
        raise ValueError("Unsupported tokenizer type")

def get_uzh_color(index):
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]

def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are consistent across tokenizers for the same token sequences.
    Colors are deterministic based on token content.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Generate colors for all unique tokens using hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use hash of token to get a deterministic index
        token_hash = hash(token)
        # Ensure positive index and wrap around to color list length
        index = abs(token_hash) % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)

        # Create a colored visualization
        html = ""

        # Build the HTML with colored spans for each token
        for token in tokens:
            color = token_colors[token]
            html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{token}</span>'

        results[name] = html

    return results

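Taken together, utils.py is consumed by app.py roughly as follows (a sketch; the first call downloads several tokenizers and can take a while):

    import utils

    tokenizers = utils.load_tokenizers()  # dict: "gpt-4o" first, then the Hugging Face models

    # SwissBERT gets special treatment: tokenize() returns the shorter of the
    # SwissBERT and local SwissBERT-GSW segmentations.
    print(utils.tokenize("Als Zürcher bini nöd so Fan vom FC Basel.",
                         tokenizers["ZurichNLP/swissbert"]))

    # One HTML string of colored <span> elements per tokenizer, keyed by model name.
    html = utils.visualize_tokens("Als Zürcher bini nöd so Fan vom FC Basel.", tokenizers)
    print(html["gpt-4o"])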