"""Gradio app that visualizes how each loaded tokenizer splits an input text."""

import os

import gradio as gr

import utils

# Create a custom theme
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[
        gr.themes.GoogleFont('Source Sans 3'),
        'ui-sans-serif',
        'system-ui',
        'sans-serif',
    ],
)

# Models that are shown inside the "More Models" accordion instead of the main
# grid, and that are rendered without an icon.
COLLAPSED_MODELS = frozenset({
    "google/gemma-3-27b-it",
    "mistralai/Mistral-Nemo-Instruct-2407",
    "CohereLabs/aya-expanse-8b",
})

# Icon file (inside the "icons" directory) for each known model name.
ICON_FILES = {
    "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
    "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
    "ZurichNLP/swissbert": "swissbert.svg",
    "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
    "google/gemma-3-27b-it": "gemma.svg",
    "gpt-4o": "chatgpt.svg",
}

# Icon used for any model name that has no entry in ICON_FILES.
DEFAULT_ICON_FILE = "chatgpt.svg"

# Custom CSS for the interface.
# The `.html-container` rule previously read `line-height: 2em; !important;`;
# the stray semicolon turned `!important` into a separate, invalid declaration.
CUSTOM_CSS = """
.tokenizer-panel > div { background: var(--input-background-fill); }
.no-padding { padding: 0 !important; }
.form { border: 0 !important; }
.html-container { line-height: 2em !important; }
.pending { opacity: 1; }
@media (prefers-color-scheme: dark) {
    .gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
    .html-container span { color: black !important; }
}
"""

# Load tokenizers only once during development
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    all_tokenizers = utils.load_tokenizers()
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")


def read_svg_file(name: str) -> str:
    """Return the SVG markup of the icon associated with a model.

    Args:
        name: Model name; names without an entry in ``ICON_FILES`` fall back
            to ``DEFAULT_ICON_FILE``.

    Returns:
        The file content, or an empty string if the file could not be read
        (the error is printed, not raised, so a missing icon never breaks
        the UI).
    """
    icon_path = os.path.join("icons", ICON_FILES.get(name, DEFAULT_ICON_FILE))
    try:
        # SVG is XML text; read it with an explicit encoding rather than the
        # platform default.
        with open(icon_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeError) as e:
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""


def get_model_icon(name: str) -> str:
    """Return the HTML snippet for a model's icon.

    Args:
        name: Model name.

    Returns:
        The icon's SVG wrapped in an inline container, or an empty string for
        collapsed models and for icons that could not be read.
    """
    # Skip icons for collapsed models
    if name in COLLAPSED_MODELS:
        return ""

    svg_content = read_svg_file(name)
    if not svg_content:
        return ""

    # NOTE(review): the markup in the two strings below was lost from the
    # file (HTML tags had been stripped out); the attribute values are a
    # reconstruction -- confirm sizes/styles against the intended design.
    # Add sizing and preserve the aspect ratio on the root SVG element.
    svg_content = svg_content.replace(
        '<svg',
        '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"',
        1,
    )
    # Wrap in a fixed-size container that sits inline next to the model name.
    return (
        '<div style="display: inline-block; vertical-align: middle; '
        'margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">'
        f'{svg_content}</div>'
    )


def process_text(text: str) -> list:
    """Process the input text and return visualizations for all tokenizers.

    Args:
        text: The text entered by the user.

    Returns:
        The values produced by ``utils.visualize_tokens`` followed by an
        update that makes the "More Models" accordion visible.
    """
    # Use the pre-loaded tokenizers
    visualizations = utils.visualize_tokens(text, all_tokenizers)
    # NOTE(review): results are matched to the output boxes by position;
    # this assumes one value per box, in the order the boxes were created --
    # TODO confirm against utils.visualize_tokens.
    return list(visualizations.values()) + [gr.update(visible=True)]


def _add_tokenizer_panel(name: str) -> gr.HTML:
    """Create one tokenizer panel in the current layout context.

    The panel consists of a header (icon plus display name) and an empty
    HTML box for the token visualization.

    Args:
        name: Model name of the tokenizer.

    Returns:
        The HTML box that receives the visualization.
    """
    display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
    with gr.Group(elem_classes="tokenizer-panel"):
        # NOTE(review): the header markup was lost from the file (tags were
        # stripped); the `model-name` span is what the dark-mode CSS rule
        # targets, the remaining inline styles are a reconstruction.
        gr.HTML(
            value=(
                '<div style="display: flex; align-items: center; margin-bottom: 8px;">'
                f'{get_model_icon(name)}'
                f'<span class="model-name">{display_name}</span>'
                '</div>'
            )
        )
        box = gr.HTML()
    return box


# Create the Gradio interface
with gr.Blocks(title="Tokens matter.", theme=theme, css=CUSTOM_CSS) as demo:
    gr.Markdown("# Tokens matter.")

    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            # `every=True` was dropped: it only applies when `value` is a
            # callable, so it never had an effect here. Real-time updates
            # come from the `change` event registered below.
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
            )

        # Right column for outputs
        with gr.Column(scale=2):
            # Output boxes, in creation order
            main_output_boxes = []
            more_output_boxes = []

            # Create 2x2 grid for main tokenizers: two columns holding the
            # first four tokenizer names, minus any collapsed models.
            with gr.Row():
                for column_names in (all_tokenizer_names[:2], all_tokenizer_names[2:4]):
                    with gr.Column():
                        for name in column_names:
                            if name in COLLAPSED_MODELS:
                                continue
                            main_output_boxes.append(_add_tokenizer_panel(name))

            # Display more tokenizers in accordion (hidden until the first
            # call to process_text makes it visible)
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                for name in all_tokenizer_names:
                    if name in COLLAPSED_MODELS:
                        more_output_boxes.append(_add_tokenizer_panel(name))

            all_outputs = main_output_boxes + more_output_boxes + [more_models]

    # Use change event for real-time updates
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )

    # Add examples
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text,
    )

if __name__ == "__main__":
    demo.launch()