from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

import tiktoken
from transformers import AutoTokenizer

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]


def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    except Exception:
        # Some checkpoints do not ship a fast tokenizer or fail to load it;
        # retry without forcing the fast implementation.
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False,
        )
    return name, tokenizer


def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)


def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) tokenizer from the jvamvas/swissbert-gsw-vocab repository on the Hugging Face Hub.

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer


def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}

    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    # Load HuggingFace tokenizers in specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer

    return tokenizers


# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse",
}


def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().

    For the SwissBERT tokenizer, both the SwissBERT and SwissBERT-GSW tokenizations are computed and the shorter one is returned.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    # Special handling for SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()
        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        # Drop the leading space marker on the first token, if present
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens
    return _tokenize_with_tokenizer(s, tokenizer)
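

# The helper below is not part of the original module; it is a minimal usage
# sketch showing how load_tokenizers(), tokenize() and MODEL_DISPLAY_NAMES fit
# together. It assumes the model repositories listed above are reachable and
# that gated checkpoints (e.g. Llama 4) have Hugging Face authentication set up.
def compare_token_counts(text: str) -> Dict[str, int]:
    """Return the number of tokens each loaded tokenizer produces for `text`."""
    counts = {}
    for name, tok in load_tokenizers().items():
        display_name = MODEL_DISPLAY_NAMES.get(name, name)
        counts[display_name] = len(tokenize(text, tok))
    return counts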


def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper function to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        # HuggingFace tokenizer
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            # Byte-level tokenizers: decode each id individually to get readable text
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Skip special tokens such as <s>, </s> or [CLS]
                continue
            elif "Ġ" in t:
                # GPT-2-style byte-level marker for a leading space
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                # GPT-2-style byte-level marker for a newline; rendered as a space
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                # SentencePiece marker for a leading space
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        # Remove tail token markers (## or @@)
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]
    elif hasattr(tokenizer, "encode"):
        # tiktoken tokenizer
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    else:
        raise ValueError("Unsupported tokenizer type")


def get_uzh_color(index):
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]


def visualize_tokens(text: str, tokenizers: Dict[str, object]):
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.

    Colors are consistent across tokenizers for the same token within a single run;
    Python string hashing is salted per process, so colors may differ between runs.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)

    # Generate colors for all unique tokens using a hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use the hash of the token to get a deterministic index
        token_hash = hash(token)
        # Ensure a positive index that wraps around the color list length
        index = abs(token_hash) % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)
        # Build the HTML with a colored span for each token
        html = ""
        for token in tokens:
            color = token_colors[token]
            # Escape the token so characters like < or & do not break the markup
            html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{escape(token)}</span>'
        results[name] = html

    return results
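

# Minimal manual test, not part of the original module: loads every tokenizer,
# prints the token lists, and writes the HTML visualizations to a file. It
# assumes network access to the Hugging Face Hub and credentials for any gated
# checkpoints; the sample sentence and output filename are arbitrary choices.
if __name__ == "__main__":
    sample = "Grüezi mitenand, wie gaht's?"
    loaded = load_tokenizers()
    for name, tok in loaded.items():
        print(f"{MODEL_DISPLAY_NAMES.get(name, name)}: {tokenize(sample, tok)}")
    visualizations = visualize_tokens(sample, loaded)
    with open("tokens.html", "w", encoding="utf-8") as f:
        for name, snippet in visualizations.items():
            f.write(f"<h3>{escape(MODEL_DISPLAY_NAMES.get(name, name))}</h3>\n{snippet}\n")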