from typing import Dict, List, Tuple
from pathlib import Path
from transformers import AutoTokenizer
import tiktoken
# UZH color palette
UZH_COLORS = [
"#BACBFF", # UZH Blue V1
"#DBF4F9", # UZH Cyan V1
"#ECF6D6", # UZH Apple V1
"#FFF4DA", # UZH Gold V1
"#FFDBCC", # UZH Orange V1
"#FBC6D4", # UZH Berry V1
"#C2C2C2", # UZH Grey V1
"#FAFAFA", # UZH Light Grey V1
"#7596FF", # UZH Blue V2
"#B7E9F4", # UZH Cyan V2
"#DBEDAD", # UZH Apple V2
"#FFE9B5", # UZH Gold V2
"#FEB799", # UZH Orange V2
"#F78CAA", # UZH Berry V2
"#A3A3A3", # UZH Grey V2
"#EFEFEF", # UZH Light Grey V2
]
def load_hf_tokenizer(name: str) -> Tuple[str, object]:
"""
Load a single HuggingFace tokenizer.
Args:
name: The name of the tokenizer to load
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
try:
tokenizer = AutoTokenizer.from_pretrained(
name,
use_fast=True,
model_max_length=1000000,
clean_up_tokenization_spaces=True,
legacy=False
)
    except Exception:
        # Fall back to the default tokenizer settings if the fast tokenizer cannot be loaded
tokenizer = AutoTokenizer.from_pretrained(
name,
model_max_length=1000000,
clean_up_tokenization_spaces=True,
legacy=False
)
return name, tokenizer
def load_openai_tokenizer(name: str) -> Tuple[str, object]:
"""
Load a single OpenAI tokenizer.
Args:
name: The name of the tokenizer to load
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
return name, tiktoken.encoding_for_model(name)
def load_gsw_tokenizer() -> Tuple[str, object]:
"""
    Load a Swiss German (GSW) tokenizer whose vocabulary is hosted on the Hugging Face Hub (jvamvas/swissbert-gsw-vocab).
Returns:
Tuple of (tokenizer_name, tokenizer_object)
"""
tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
return "swissbert-gsw", tokenizer
def load_tokenizers() -> Dict[str, object]:
"""
Load all tokenizers.
Returns:
Dictionary mapping tokenizer names to tokenizer objects
"""
tokenizers = {}
# Load OpenAI tokenizers first
openai_names = ["gpt-4o"]
for name in openai_names:
tokenizer_name, tokenizer = load_openai_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
# Load HuggingFace tokenizers in specified order
hf_names = [
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"deepseek-ai/DeepSeek-V3-0324",
"ZurichNLP/swissbert",
"google/gemma-3-27b-it",
"mistralai/Mistral-Nemo-Instruct-2407",
"CohereLabs/aya-expanse-8b",
]
for name in hf_names:
tokenizer_name, tokenizer = load_hf_tokenizer(name)
tokenizers[tokenizer_name] = tokenizer
return tokenizers
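# Illustrative usage sketch: load_tokenizers() returns a mapping from model names to
# tokenizer objects, e.g. {"gpt-4o": <tiktoken.Encoding>, "ZurichNLP/swissbert": <PreTrainedTokenizerFast>, ...}.
# Downloading the HuggingFace tokenizers requires network access, and gated checkpoints
# such as the Llama 4 tokenizer may additionally require Hub authentication.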
# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
"meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
"deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
"ZurichNLP/swissbert": "SwissBERT 🇨🇭",
"mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
"google/gemma-3-27b-it": "Gemma 3",
"gpt-4o": "ChatGPT (gpt-4o)",
"CohereLabs/aya-expanse-8b": "Aya Expanse"
}
def tokenize(s: str, tokenizer) -> List[str]:
"""
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the text is tokenized with both the standard and the GSW vocabulary, and the shorter tokenization is returned.
Args:
s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()
Returns:
        A list of tokens, with special tokens removed and any trailing subword markers (## or @@) stripped
"""
# Special handling for SwissBERT tokenizer
if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
# Get SwissBERT-GSW tokenizer
_, gsw_tokenizer = load_gsw_tokenizer()
        # Get tokenizations from both tokenizers
swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
# Return the shorter tokenization
shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
shorter_tokens[0] = shorter_tokens[0][1:]
return shorter_tokens
return _tokenize_with_tokenizer(s, tokenizer)
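# Illustrative usage sketch (assuming `tokenizers = load_tokenizers()` has been called):
# for Swiss German input, the GSW vocabulary typically yields fewer subwords than the
# base SwissBERT vocabulary, so its tokenization is the one returned here, e.g.:
#
#   tokens = tokenize("Grüezi mitenand", tokenizers["ZurichNLP/swissbert"])
#   # -> the shorter of the SwissBERT and SwissBERT-GSW tokenizations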
def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
"""
Internal helper function to tokenize a string with a given tokenizer.
Args:
s: The string to tokenize
tokenizer: A tokenizer object
Returns:
        A list of tokens, with special tokens removed and any trailing subword markers (## or @@) stripped
"""
if hasattr(tokenizer, "tokenize"):
encoded = tokenizer.encode(s, add_special_tokens=False)
if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
else:
tokens = tokenizer.convert_ids_to_tokens(encoded)
        filtered_tokens = []
        for t in tokens:
            # Skip special tokens such as <s>, </s> or [CLS], [SEP]
            if t.startswith("<") or t.startswith("["):
                continue
            # Byte-level BPE markers: Ġ marks a word-initial space, Ċ a newline
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            # SentencePiece marker: ▁ marks a word-initial space
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        # Strip trailing subword continuation markers (## and @@)
        return [t.rstrip("##").rstrip("@@") for t in filtered_tokens]
elif hasattr(tokenizer, "encode"):
token_ids = tokenizer.encode(s)
return [tokenizer.decode([token_id]) for token_id in token_ids]
else:
raise ValueError("Unsupported tokenizer type")
def get_uzh_color(index):
"""Get a color from the UZH color palette based on index."""
return UZH_COLORS[index % len(UZH_COLORS)]
def visualize_tokens(text: str, tokenizers: Dict[str, object]):
"""
Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are consistent across tokenizers for the same tokens, and are deterministic based on token content.
Args:
text: The input text to tokenize
tokenizers: Dictionary of tokenizers
Returns:
Dictionary mapping tokenizer names to HTML visualizations
"""
results = {}
# First pass: collect all unique tokens across all tokenizers
all_tokens = set()
for tokenizer in tokenizers.values():
tokens = tokenize(text, tokenizer)
all_tokens.update(tokens)
# Generate colors for all unique tokens using hash-based approach
token_colors = {}
for token in all_tokens:
# Use hash of token to get a deterministic index
token_hash = hash(token)
# Ensure positive index and wrap around to color list length
index = abs(token_hash) % len(UZH_COLORS)
token_colors[token] = get_uzh_color(index)
# Second pass: create visualizations using the consistent colors
for name, tokenizer in tokenizers.items():
tokens = tokenize(text, tokenizer)
# Create a colored visualization
html = ""
# Build the HTML with colored spans for each token
for token in tokens:
color = token_colors[token]
html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{token}</span>'
results[name] = html
return results
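# Minimal end-to-end sketch (assumed demo code, guarded so it only runs when the module
# is executed directly): tokenizes a sample Swiss German sentence with every loaded
# tokenizer and prints one HTML snippet per model. The sample text is arbitrary, and
# running this requires network access to download the tokenizers.
if __name__ == "__main__":
    sample_text = "Chunnsch hüt no go poschte?"
    loaded_tokenizers = load_tokenizers()
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    for model_name, html_snippet in visualizations.items():
        display_name = MODEL_DISPLAY_NAMES.get(model_name, model_name)
        print(f"{display_name}: {html_snippet}")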