import html
from typing import Dict, List, Tuple

import tiktoken
from transformers import AutoTokenizer

# UZH color palette
UZH_COLORS = [
"#BACBFF", # UZH Blue V1
"#DBF4F9", # UZH Cyan V1
"#ECF6D6", # UZH Apple V1
"#FFF4DA", # UZH Gold V1
"#FFDBCC", # UZH Orange V1
"#FBC6D4", # UZH Berry V1
"#C2C2C2", # UZH Grey V1
"#FAFAFA", # UZH Light Grey V1
"#7596FF", # UZH Blue V2
"#B7E9F4", # UZH Cyan V2
"#DBEDAD", # UZH Apple V2
"#FFE9B5", # UZH Gold V2
"#FEB799", # UZH Orange V2
"#F78CAA", # UZH Berry V2
"#A3A3A3", # UZH Grey V2
"#EFEFEF", # UZH Light Grey V2
]


def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    except Exception:
        # Fall back to loading without use_fast if the fast tokenizer is unavailable
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    return name, tokenizer


def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.

    Args:
        name: The name of the tokenizer to load

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)


def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) vocabulary variant of the SwissBERT tokenizer from the Hugging Face Hub.

    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer


def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.

    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
tokenizers = {}
# Load OpenAI tokenizers first
openai_names = ["gpt-4o"]
for name in openai_names:
tokenizer_name, tokenizer = load_openai_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
# Load HuggingFace tokenizers in specified order
hf_names = [
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"deepseek-ai/DeepSeek-V3-0324",
"ZurichNLP/swissbert",
"google/gemma-3-27b-it",
"mistralai/Mistral-Nemo-Instruct-2407",
"CohereLabs/aya-expanse-8b",
]
for name in hf_names:
tokenizer_name, tokenizer = load_hf_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
return tokenizers
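
# Example usage (a minimal sketch; assumes network access to the Hugging Face Hub and,
# for gated models such as Llama 4, a valid HF access token):
#
#     tokenizers = load_tokenizers()
#     gpt4o_encoding = tokenizers["gpt-4o"]          # tiktoken encoding
#     swissbert = tokenizers["ZurichNLP/swissbert"]  # HF fast tokenizer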


# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
"meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
"deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
"ZurichNLP/swissbert": "SwissBERT 🇨🇭",
"mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
"google/gemma-3-27b-it": "Gemma 3",
"gpt-4o": "ChatGPT (gpt-4o)",
"CohereLabs/aya-expanse-8b": "Aya Expanse"
}


def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().

    For the SwissBERT tokenizer, both the standard SwissBERT and the SwissBERT-GSW
    vocabularies are tried and the shorter tokenization is returned.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()

    Returns:
        A list of tokens, with special tokens removed and any trailing subword
        markers (## or @@) stripped
    """
# Special handling for SwissBERT tokenizer
if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
# Get SwissBERT-GSW tokenizer
_, gsw_tokenizer = load_gsw_tokenizer()
        # Get tokenizations from both tokenizers
swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
# Return the shorter tokenization
shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
shorter_tokens[0] = shorter_tokens[0][1:]
return shorter_tokens
return _tokenize_with_tokenizer(s, tokenizer)


def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper to tokenize a string with a given tokenizer.

    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object

    Returns:
        A list of tokens, with special tokens removed and any trailing subword
        markers (## or @@) stripped
    """
    if hasattr(tokenizer, "tokenize"):
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(
            name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]
        ):
            # Byte-level BPE vocabularies read more naturally when each ID is decoded individually
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            if t.startswith("<") or t.startswith("["):
                # Drop special-token markup such as <s> or [CLS]
                continue
            elif "Ġ" in t:
                # Byte-level BPE marker for a leading space
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                # Byte-level BPE marker for a newline; shown as a space in the visualization
                filtered_tokens.append(t.replace("Ċ", " "))
            elif t.startswith("▁"):
                # SentencePiece word-boundary marker
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        # Strip trailing subword continuation markers
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]
    elif hasattr(tokenizer, "encode"):
        # tiktoken encodings only expose encode/decode, so decode each ID individually
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    else:
        raise ValueError("Unsupported tokenizer type")


def get_uzh_color(index: int) -> str:
    """Get a color from the UZH color palette based on index."""
    return UZH_COLORS[index % len(UZH_COLORS)]


def visualize_tokens(text: str, tokenizers: Dict[str, object]) -> Dict[str, str]:
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.

    Colors are derived deterministically from token content, so the same token
    is shown in the same color across all tokenizers.

    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers

    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}

    # First pass: tokenize once per tokenizer and collect all unique tokens
    tokens_by_name = {}
    all_tokens = set()
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)
        tokens_by_name[name] = tokens
        all_tokens.update(tokens)

    # Assign each unique token a deterministic color from the UZH palette
    token_colors = {}
    for token in all_tokens:
        # Use the token's hash to pick a palette index (consistent within a single run)
        index = abs(hash(token)) % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)

    # Second pass: build the HTML visualization for each tokenizer using the shared colors
    for name, tokens in tokens_by_name.items():
        html_parts = []
        for token in tokens:
            color = token_colors[token]
            html_parts.append(
                f'<span style="background-color: {color}; padding: 2px; margin: 1px; '
                f'border-radius: 3px;">{html.escape(token)}</span>'
            )
        results[name] = "".join(html_parts)

    return results
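

# A minimal, self-contained demo sketch: the sample sentence is an arbitrary assumption,
# and loading the gated Hugging Face models may require authentication and network access.
if __name__ == "__main__":
    sample_text = "Grüezi mitenand! Wie gaht's?"  # assumed Swiss German example
    demo_tokenizers = load_tokenizers()
    for demo_name, demo_html in visualize_tokens(sample_text, demo_tokenizers).items():
        display_name = MODEL_DISPLAY_NAMES.get(demo_name, demo_name)
        print(f"{display_name}: {demo_html[:120]}...")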