import hashlib
from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

from transformers import AutoTokenizer
import tiktoken

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]

def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.
    
    Args:
        name: The name of the tokenizer to load
        
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    except Exception:
        # Retry without forcing the fast implementation if the first attempt fails
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    return name, tokenizer

def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.
    
    Args:
        name: The name of the tokenizer to load
        
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)

def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) vocabulary variant of the SwissBERT tokenizer
    from the Hugging Face Hub ("jvamvas/swissbert-gsw-vocab").
    
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer

def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.
    
    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}
    
    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer
    
    # Load HuggingFace tokenizers in specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer
    
    return tokenizers

# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse"
}
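
# Illustrative helper (an assumption, not part of the original interface): resolve a
# human-readable display name for a model key, falling back to the raw identifier so
# that models missing from MODEL_DISPLAY_NAMES still render sensibly.
def get_display_name(model_name: str) -> str:
    """Return the display name for a model, or the model name itself if unknown."""
    return MODEL_DISPLAY_NAMES.get(model_name, model_name)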

def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the text is tokenized with both the default and the
    GSW vocabulary, and the shorter of the two tokenizations is returned.
    
    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()
        
    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) stripped
    """
    # Special handling for SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()
        
        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
        
        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens
    
    return _tokenize_with_tokenizer(s, tokenizer)

def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper function to tokenize a string with a given tokenizer.
    
    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object
        
    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            # For byte-level BPE tokenizers, decoding each id individually yields readable
            # text pieces instead of raw byte-level token strings
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        
        filtered_tokens = []
        for t in tokens:
            # Drop special tokens such as <s>, </s>, [CLS] or [SEP]
            if t.startswith("<") or t.startswith("["):
                continue
            # Byte-level BPE markers: "Ġ" encodes a leading space, "Ċ" a newline
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            # SentencePiece marker "▁" encodes a leading space
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        
        # Strip the tail markers "##" and "@@" as whole suffixes rather than
        # rstrip-ping individual characters, which would also eat legitimate "#"/"@"
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]
    
    elif hasattr(tokenizer, "encode"):
        # tiktoken encoders: decode each id back to its string piece
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    
    else:
        raise ValueError("Unsupported tokenizer type")
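
# Example sketch: the sample sentence and this helper are illustrative additions, not part
# of the original module. It shows how load_tokenizers() and tokenize() combine to compare
# how many tokens each model needs for the same text; fewer tokens usually indicates a
# vocabulary that covers the language better.
def example_token_counts(text: str = "Grüezi mitenand, wie gaht's?") -> Dict[str, int]:
    """Return the number of tokens each tokenizer from load_tokenizers() produces for `text`."""
    return {name: len(tokenize(text, tok)) for name, tok in load_tokenizers().items()}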

def get_uzh_color(index: int) -> str:
    """Get a color from the UZH color palette based on index, wrapping around the palette."""
    return UZH_COLORS[index % len(UZH_COLORS)]

def visualize_tokens(text: str, tokenizers: Dict[str, object]) -> Dict[str, str]:
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are derived deterministically from the token content, so the same token
    gets the same color in every tokenizer's visualization.
    
    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers
        
    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}
    
    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)
    
    # Generate colors for all unique tokens using a stable hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use a stable hash of the token content: the built-in hash() is salted per
        # process, so it would not give the same colors across runs
        token_hash = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
        # Wrap around to the color list length
        index = token_hash % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)
    
    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)
        
        # Create a colored visualization
        html = ""
        
        # Build the HTML with colored spans for each token
        for token in tokens:
            color = token_colors[token]
            # Escape the token so characters such as "&" or "<" cannot break the markup
            html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{escape(token)}</span>'
        
        results[name] = html
    
    return results
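
# Minimal usage sketch, assuming network access to download the tokenizers and a writable
# working directory; the sample text and output filename are placeholders.
if __name__ == "__main__":
    sample_text = "Das Chuchichäschtli isch zoberscht im Schaft."
    loaded_tokenizers = load_tokenizers()
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    # Assemble a simple HTML page with one line per tokenizer and write it to disk
    lines = [
        f"<p><b>{MODEL_DISPLAY_NAMES.get(name, name)}</b>: {snippet}</p>"
        for name, snippet in visualizations.items()
    ]
    Path("token_visualization.html").write_text("\n".join(lines), encoding="utf-8")
    # Print a quick token-count summary to the console
    for name, tok in loaded_tokenizers.items():
        print(f"{name}: {len(tokenize(sample_text, tok))} tokens")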