from typing import Dict, List, Tuple
from pathlib import Path
from transformers import AutoTokenizer
import tiktoken
# UZH color palette
UZH_COLORS = [
"#BACBFF", # UZH Blue V1
"#DBF4F9", # UZH Cyan V1
"#ECF6D6", # UZH Apple V1
"#FFF4DA", # UZH Gold V1
"#FFDBCC", # UZH Orange V1
"#FBC6D4", # UZH Berry V1
"#C2C2C2", # UZH Grey V1
"#FAFAFA", # UZH Light Grey V1
"#7596FF", # UZH Blue V2
"#B7E9F4", # UZH Cyan V2
"#DBEDAD", # UZH Apple V2
"#FFE9B5", # UZH Gold V2
"#FEB799", # UZH Orange V2
"#F78CAA", # UZH Berry V2
"#A3A3A3", # UZH Grey V2
"#EFEFEF", # UZH Light Grey V2
]
def load_hf_tokenizer(name: str) -> Tuple[str, object]:
"""
Load a single HuggingFace tokenizer.
Args:
name: The name of the tokenizer to load
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
try:
tokenizer = AutoTokenizer.from_pretrained(
name,
use_fast=True,
model_max_length=1000000,
clean_up_tokenization_spaces=True,
legacy=False
)
    except Exception:
        # Fall back to the default tokenizer settings if the fast tokenizer cannot be loaded
tokenizer = AutoTokenizer.from_pretrained(
name,
model_max_length=1000000,
clean_up_tokenization_spaces=True,
legacy=False
)
return name, tokenizer
def load_openai_tokenizer(name: str) -> Tuple[str, object]:
"""
Load a single OpenAI tokenizer.
Args:
name: The name of the tokenizer to load
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
return name, tiktoken.encoding_for_model(name)
def load_gsw_tokenizer() -> Tuple[str, object]:
"""
    Load a Swiss German (GSW) tokenizer whose vocabulary is hosted on the Hugging Face Hub (jvamvas/swissbert-gsw-vocab).
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
return "swissbert-gsw", tokenizer
def load_tokenizers() -> Dict[str, object]:
"""
Load all tokenizers.
Returns:
Dictionary mapping tokenizer names to tokenizer objects
"""
tokenizers = {}
# Load OpenAI tokenizers first
openai_names = ["gpt-4o"]
for name in openai_names:
tokenizer_name, tokenizer = load_openai_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
# Load HuggingFace tokenizers in specified order
hf_names = [
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"deepseek-ai/DeepSeek-V3-0324",
"ZurichNLP/swissbert",
"google/gemma-3-27b-it",
"mistralai/Mistral-Nemo-Instruct-2407",
"CohereLabs/aya-expanse-8b",
]
for name in hf_names:
tokenizer_name, tokenizer = load_hf_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
return tokenizers
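# Illustrative usage sketch: load_tokenizers() returns a mapping from model names to
# tokenizer objects, e.g. {"gpt-4o": <tiktoken.Encoding>, "ZurichNLP/swissbert": <PreTrainedTokenizerFast>, ...}.
# Downloading the HuggingFace tokenizers requires network access, and gated checkpoints
# such as the Llama 4 tokenizer may additionally require Hub authentication.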
# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
"meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
"deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
"ZurichNLP/swissbert": "SwissBERT 🇨🇭",
"mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
"google/gemma-3-27b-it": "Gemma 3",
"gpt-4o": "ChatGPT (gpt-4o)",
"CohereLabs/aya-expanse-8b": "Aya Expanse"
}
def tokenize(s: str, tokenizer) -> List[str]:
"""
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the text is tokenized with both the standard and the GSW vocabulary, and the shorter tokenization is returned.
Args:
s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()
Returns:
        A list of tokens, with special tokens removed and any trailing subword markers (## or @@) stripped
"""
# Special handling for SwissBERT tokenizer
if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
# Get SwissBERT-GSW tokenizer
_, gsw_tokenizer = load_gsw_tokenizer()
        # Get tokenizations from both tokenizers
swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
# Return the shorter tokenization
shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
shorter_tokens[0] = shorter_tokens[0][1:]
return shorter_tokens
return _tokenize_with_tokenizer(s, tokenizer)
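# Illustrative usage sketch (assuming `tokenizers = load_tokenizers()` has been called):
# for Swiss German input, the GSW vocabulary typically yields fewer subwords than the
# base SwissBERT vocabulary, so its tokenization is the one returned here, e.g.:
#
#   tokens = tokenize("Grüezi mitenand", tokenizers["ZurichNLP/swissbert"])
#   # -> the shorter of the SwissBERT and SwissBERT-GSW tokenizations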
def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
"""
Internal helper function to tokenize a string with a given tokenizer.
Args:
s: The string to tokenize
tokenizer: A tokenizer object
Returns:
        A list of tokens, with special tokens removed and any trailing subword markers (## or @@) stripped
"""
if hasattr(tokenizer, "tokenize"):
encoded = tokenizer.encode(s, add_special_tokens=False)
if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
else:
tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            # Skip special tokens such as <s>, </s> or [CLS], [SEP]
            if t.startswith("<") or t.startswith("["):
                continue
            # Byte-level BPE markers: Ġ marks a word-initial space, Ċ a newline
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            # SentencePiece marker: ▁ marks a word-initial space
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        # Strip trailing subword continuation markers (## and @@)
        return [t.rstrip("##").rstrip("@@") for t in filtered_tokens]
elif hasattr(tokenizer, "encode"):
token_ids = tokenizer.encode(s)
return [tokenizer.decode([token_id]) for token_id in token_ids]
else:
raise ValueError("Unsupported tokenizer type")
def get_uzh_color(index):
"""Get a color from the UZH color palette based on index."""
return UZH_COLORS[index % len(UZH_COLORS)]
def visualize_tokens(text: str, tokenizers: Dict[str, object]):
"""
Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are consistent across tokenizers for the same tokens, and are deterministic based on token content.
Args:
text: The input text to tokenize
tokenizers: Dictionary of tokenizers
Returns:
Dictionary mapping tokenizer names to HTML visualizations
"""
results = {}
# First pass: collect all unique tokens across all tokenizers
all_tokens = set()
for tokenizer in tokenizers.values():
tokens = tokenize(text, tokenizer)
all_tokens.update(tokens)
# Generate colors for all unique tokens using hash-based approach
token_colors = {}
for token in all_tokens:
# Use hash of token to get a deterministic index
token_hash = hash(token)
# Ensure positive index and wrap around to color list length
index = abs(token_hash) % len(UZH_COLORS)
token_colors[token] = get_uzh_color(index)
# Second pass: create visualizations using the consistent colors
for name, tokenizer in tokenizers.items():
tokens = tokenize(text, tokenizer)
# Create a colored visualization
html = ""
# Build the HTML with colored spans for each token
for token in tokens:
color = token_colors[token]
html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{token}</span>'
results[name] = html
return results
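# Minimal end-to-end sketch (assumed demo code, guarded so it only runs when the module
# is executed directly): tokenizes a sample Swiss German sentence with every loaded
# tokenizer and prints one HTML snippet per model. The sample text is arbitrary, and
# running this requires network access to download the tokenizers.
if __name__ == "__main__":
    sample_text = "Chunnsch hüt no go poschte?"
    loaded_tokenizers = load_tokenizers()
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    for model_name, html_snippet in visualizations.items():
        display_name = MODEL_DISPLAY_NAMES.get(model_name, model_name)
        print(f"{display_name}: {html_snippet}")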