import hashlib
from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

from transformers import AutoTokenizer
import tiktoken

# UZH color palette
UZH_COLORS = [
    "#BACBFF",  # UZH Blue V1
    "#DBF4F9",  # UZH Cyan V1
    "#ECF6D6",  # UZH Apple V1
    "#FFF4DA",  # UZH Gold V1
    "#FFDBCC",  # UZH Orange V1
    "#FBC6D4",  # UZH Berry V1
    "#C2C2C2",  # UZH Grey V1
    "#FAFAFA",  # UZH Light Grey V1
    "#7596FF",  # UZH Blue V2
    "#B7E9F4",  # UZH Cyan V2
    "#DBEDAD",  # UZH Apple V2
    "#FFE9B5",  # UZH Gold V2
    "#FEB799",  # UZH Orange V2
    "#F78CAA",  # UZH Berry V2
    "#A3A3A3",  # UZH Grey V2
    "#EFEFEF",  # UZH Light Grey V2
]

def load_hf_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single HuggingFace tokenizer.
    
    Args:
        name: The name of the tokenizer to load
        
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            use_fast=True,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    except Exception:
        # Retry without forcing the fast implementation if the first attempt fails
        tokenizer = AutoTokenizer.from_pretrained(
            name,
            model_max_length=1000000,
            clean_up_tokenization_spaces=True,
            legacy=False
        )
    return name, tokenizer

def load_openai_tokenizer(name: str) -> Tuple[str, object]:
    """
    Load a single OpenAI tokenizer.
    
    Args:
        name: The name of the tokenizer to load
        
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    return name, tiktoken.encoding_for_model(name)

def load_gsw_tokenizer() -> Tuple[str, object]:
    """
    Load the Swiss German (GSW) vocabulary variant of the SwissBERT tokenizer
    from the Hugging Face Hub ("jvamvas/swissbert-gsw-vocab").
    
    Returns:
        Tuple of (tokenizer_name, tokenizer_object)
    """
    tokenizer = AutoTokenizer.from_pretrained("jvamvas/swissbert-gsw-vocab")
    return "swissbert-gsw", tokenizer

def load_tokenizers() -> Dict[str, object]:
    """
    Load all tokenizers.
    
    Returns:
        Dictionary mapping tokenizer names to tokenizer objects
    """
    tokenizers = {}
    
    # Load OpenAI tokenizers first
    openai_names = ["gpt-4o"]
    for name in openai_names:
        tokenizer_name, tokenizer = load_openai_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer
    
    # Load HuggingFace tokenizers in specified order
    hf_names = [
        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "ZurichNLP/swissbert",
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    ]
    for name in hf_names:
        tokenizer_name, tokenizer = load_hf_tokenizer(name)
        tokenizers[tokenizer_name] = tokenizer
    
    return tokenizers

# Mapping of model names to display names
MODEL_DISPLAY_NAMES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "Llama 4",
    "deepseek-ai/DeepSeek-V3-0324": "DeepSeek V3",
    "ZurichNLP/swissbert": "SwissBERT 🇨🇭",
    "mistralai/Mistral-Nemo-Instruct-2407": "Mistral NeMo",
    "google/gemma-3-27b-it": "Gemma 3",
    "gpt-4o": "ChatGPT (gpt-4o)",
    "CohereLabs/aya-expanse-8b": "Aya Expanse"
}
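
# Illustrative helper (an assumption, not part of the original interface): resolve a
# human-readable display name for a model key, falling back to the raw identifier so
# that models missing from MODEL_DISPLAY_NAMES still render sensibly.
def get_display_name(model_name: str) -> str:
    """Return the display name for a model, or the model name itself if unknown."""
    return MODEL_DISPLAY_NAMES.get(model_name, model_name)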

def tokenize(s: str, tokenizer) -> List[str]:
    """
    Tokenize a string using any tokenizer returned by load_tokenizers().
    For the SwissBERT tokenizer, the text is tokenized with both the default and the
    GSW vocabulary, and the shorter of the two tokenizations is returned.
    
    Args:
        s: The string to tokenize
        tokenizer: A tokenizer returned by load_tokenizers()
        
    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) stripped
    """
    # Special handling for SwissBERT tokenizer
    if hasattr(tokenizer, "name_or_path") and "swissbert" in tokenizer.name_or_path.lower():
        # Get SwissBERT-GSW tokenizer
        _, gsw_tokenizer = load_gsw_tokenizer()
        
        # Get tokenizations from both tokenizers
        swissbert_tokens = _tokenize_with_tokenizer(s, tokenizer)
        gsw_tokens = _tokenize_with_tokenizer(s, gsw_tokenizer)
        
        # Return the shorter tokenization
        shorter_tokens = swissbert_tokens if len(swissbert_tokens) <= len(gsw_tokens) else gsw_tokens
        if len(shorter_tokens) > 0 and shorter_tokens[0].startswith(" "):
            shorter_tokens[0] = shorter_tokens[0][1:]
        return shorter_tokens
    
    return _tokenize_with_tokenizer(s, tokenizer)

def _tokenize_with_tokenizer(s: str, tokenizer) -> List[str]:
    """
    Internal helper function to tokenize a string with a given tokenizer.
    
    Args:
        s: The string to tokenize
        tokenizer: A tokenizer object
        
    Returns:
        A list of tokens, with special tokens removed and any tail token markers (## or @@) removed
    """
    if hasattr(tokenizer, "tokenize"):
        encoded = tokenizer.encode(s, add_special_tokens=False)
        if hasattr(tokenizer, "name_or_path") and any(name in tokenizer.name_or_path.lower() for name in ["llama", "deepseek", "mistral", "aya"]):
            # For byte-level BPE tokenizers, decoding each id individually yields readable
            # text pieces instead of raw byte-level token strings
            tokens = [tokenizer.decode([token_id], skip_special_tokens=False) for token_id in encoded]
        else:
            tokens = tokenizer.convert_ids_to_tokens(encoded)
        
        filtered_tokens = []
        for t in tokens:
            # Drop special tokens such as <s>, </s>, [CLS] or [SEP]
            if t.startswith("<") or t.startswith("["):
                continue
            # Byte-level BPE markers: "Ġ" encodes a leading space, "Ċ" a newline
            elif "Ġ" in t:
                filtered_tokens.append(t.replace("Ġ", " "))
            elif "Ċ" in t:
                filtered_tokens.append(t.replace("Ċ", " "))
            # SentencePiece marker "▁" encodes a leading space
            elif t.startswith("▁"):
                filtered_tokens.append(" " + t[1:])
            else:
                filtered_tokens.append(t)
        
        # Strip the tail markers "##" and "@@" as whole suffixes rather than
        # rstrip-ping individual characters, which would also eat legitimate "#"/"@"
        return [t.removesuffix("##").removesuffix("@@") for t in filtered_tokens]
    
    elif hasattr(tokenizer, "encode"):
        # tiktoken encoders: decode each id back to its string piece
        token_ids = tokenizer.encode(s)
        return [tokenizer.decode([token_id]) for token_id in token_ids]
    
    else:
        raise ValueError("Unsupported tokenizer type")
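
# Example sketch: the sample sentence and this helper are illustrative additions, not part
# of the original module. It shows how load_tokenizers() and tokenize() combine to compare
# how many tokens each model needs for the same text; fewer tokens usually indicates a
# vocabulary that covers the language better.
def example_token_counts(text: str = "Grüezi mitenand, wie gaht's?") -> Dict[str, int]:
    """Return the number of tokens each tokenizer from load_tokenizers() produces for `text`."""
    return {name: len(tokenize(text, tok)) for name, tok in load_tokenizers().items()}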

def get_uzh_color(index: int) -> str:
    """Get a color from the UZH color palette based on index, wrapping around the palette."""
    return UZH_COLORS[index % len(UZH_COLORS)]

def visualize_tokens(text: str, tokenizers: Dict[str, object]) -> Dict[str, str]:
    """
    Tokenize text with each tokenizer and visualize the tokens with colors.
    Colors are derived deterministically from the token content, so the same token
    gets the same color in every tokenizer's visualization.
    
    Args:
        text: The input text to tokenize
        tokenizers: Dictionary of tokenizers
        
    Returns:
        Dictionary mapping tokenizer names to HTML visualizations
    """
    results = {}
    
    # First pass: collect all unique tokens across all tokenizers
    all_tokens = set()
    for tokenizer in tokenizers.values():
        tokens = tokenize(text, tokenizer)
        all_tokens.update(tokens)
    
    # Generate colors for all unique tokens using a stable hash-based approach
    token_colors = {}
    for token in all_tokens:
        # Use a stable hash of the token content: the built-in hash() is salted per
        # process, so it would not give the same colors across runs
        token_hash = int(hashlib.md5(token.encode("utf-8")).hexdigest(), 16)
        # Wrap around to the color list length
        index = token_hash % len(UZH_COLORS)
        token_colors[token] = get_uzh_color(index)
    
    # Second pass: create visualizations using the consistent colors
    for name, tokenizer in tokenizers.items():
        tokens = tokenize(text, tokenizer)
        
        # Create a colored visualization
        html = ""
        
        # Build the HTML with colored spans for each token
        for token in tokens:
            color = token_colors[token]
            # Escape the token so characters such as "&" or "<" cannot break the markup
            html += f'<span style="background-color: {color}; padding: 2px; margin: 1px; border-radius: 3px;">{escape(token)}</span>'
        
        results[name] = html
    
    return results
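
# Minimal usage sketch, assuming network access to download the tokenizers and a writable
# working directory; the sample text and output filename are placeholders.
if __name__ == "__main__":
    sample_text = "Das Chuchichäschtli isch zoberscht im Schaft."
    loaded_tokenizers = load_tokenizers()
    visualizations = visualize_tokens(sample_text, loaded_tokenizers)
    # Assemble a simple HTML page with one line per tokenizer and write it to disk
    lines = [
        f"<p><b>{MODEL_DISPLAY_NAMES.get(name, name)}</b>: {snippet}</p>"
        for name, snippet in visualizations.items()
    ]
    Path("token_visualization.html").write_text("\n".join(lines), encoding="utf-8")
    # Print a quick token-count summary to the console
    for name, tok in loaded_tokenizers.items():
        print(f"{name}: {len(tokenize(sample_text, tok))} tokens")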