from typing import Dict, List, Tuple
from pathlib import Path

from transformers import AutoTokenizer
import tiktoken

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]


def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    except Exception:
        # Retry without forcing the fast tokenizer if the first attempt fails
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    return name, tokenizer


def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)


def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) vocabulary variant of the SwissBERT tokenizer.

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer


def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Load HuggingFace tokenizers in specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers


# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse",
}


def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer from load_tokenizers().

    For the SwissBERT tokenizer, compares the SwissBERT and SwissBERT-GSW
    tokenizations and returns the shorter one.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer from load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token
        markers (## or @@) removed
    """
    # Special handling for SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()

        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)

        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens

    return _tokenize_with_tokenizer(s, tokenizer)


def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper function to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token
        markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        # HuggingFace tokenizer
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(
            name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]
        ):
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Skip special tokens such as <s> or [CLS]
                continue
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        return [t.rstrip("##").rstrip("@@") for t in filtered_tokens]
    elif hasattr(tokenizer, "encode"):
        # tiktoken encoding
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    else:
        raise ValueError("Unsupported tokenizer type")


def get_uzh_color(index):
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]


def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.

    Colors are consistent across tokenizers for the same token sequences.
    Colors are deterministic based on token content (within a single run).

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Generate colors for all unique tokens using a hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use the hash of the token to get an index
        # (string hashing is salted per process, so colors are stable only within a run)
        token_hash = hash(token)
        # Ensure positive index and wrap around to color list length
        index = abs(token_hash) % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)

        # Create a colored visualization
        html = ""

        # Build the HTML with colored spans for each token
        for token in tokens:
            color = token_colors[token]
            html += f'<span style="background-color: {color}">{token}</span>'

        results[name] = html

    return results
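

# --- Illustrative usage sketch (not part of the original module) -------------
# A minimal demo of the functions above, assuming `transformers` and `tiktoken`
# are installed and that the tokenizers can be downloaded (some of the listed
# models are gated on the Hugging Face Hub and may require authentication).
# The sample sentence is arbitrary.
if __name__ == "__main__":
    loaded = load_tokenizers()
    sample = "Grüezi mitenand, wie gaht's?"
    visualizations = visualize_tokens(sample, loaded)
    for model_name, html in visualizations.items():
        display_name = MODEL_DISPLAY_NAMES.get(model_name, model_name)
        token_count = len(tokenize(sample, loaded[model_name]))
        print(f"{display_name}: {token_count} tokens")
        print(html)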