File size: 6,642 Bytes
a35d485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35e7f94
 
 
 
 
a35d485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35e7f94
a35d485
 
 
 
 
 
 
 
35e7f94
a35d485
 
 
 
 
 
 
 
 
 
35e7f94
a35d485
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import gradio as gr
import utils
import os

# Custom look: large text, square corners, Source Sans 3 with sensible fallbacks.
_FONT_STACK = [gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif']
theme = gr.themes.Base(
    font=_FONT_STACK,
    text_size="lg",
    radius_size="none",
)

# Load tokenizers only once during development.
# gr.NO_RELOAD guards this so Gradio's auto-reload mode does not re-run the
# (expensive) tokenizer loading on every source change.
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    # Mapping of model name -> tokenizer, provided by the project-local utils module.
    all_tokenizers = utils.load_tokenizers()
    # Preserve the mapping's key order; the UI layout below slices this list.
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")

def read_svg_file(name: str) -> str:
    """Return the raw SVG markup for a model's icon, or "" on failure.

    Args:
        name: Model identifier (e.g. a Hugging Face repo id). Unknown names
            fall back to the ChatGPT icon.

    Returns:
        The icon file's contents, or an empty string if it cannot be read.
    """
    # Maps model identifiers to icon filenames inside the local "icons" directory.
    icon_map = {
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
        "ZurichNLP/swissbert": "swissbert.svg",
        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
        "google/gemma-3-27b-it": "gemma.svg",
        "gpt-4o": "chatgpt.svg",
    }
    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
    try:
        # SVG is XML text; read it explicitly as UTF-8 instead of relying on
        # the platform default encoding.
        with open(icon_path, "r", encoding="utf-8") as f:
            return f.read()
    except OSError as e:
        # Best-effort: a missing/unreadable icon should not break the app.
        # Catch only filesystem errors rather than a blanket Exception.
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""

def get_model_icon(name: str) -> str:
    """Return an inline-HTML icon for *name*, or "" for collapsed models."""
    # Models shown in the collapsed "More Models" section get no icon.
    collapsed_models = (
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    )
    if name in collapsed_models:
        return ""

    svg_markup = read_svg_file(name)
    if not svg_markup:
        return ""

    # Pin the SVG element to a fixed 24x24 box while keeping its aspect ratio.
    sized_svg = svg_markup.replace(
        '<svg',
        '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"',
    )
    # Wrap in a container that clips overflow and aligns with the label text.
    wrapper_open = (
        '<div style="display: inline-block; vertical-align: middle; '
        'margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">'
    )
    return f'{wrapper_open}{sized_svg}</div>'

def process_text(text):
    """Tokenize *text* with every pre-loaded tokenizer.

    Returns one HTML visualization per tokenizer (in the tokenizers' key
    order) plus a final update that makes the "More Models" section visible.
    """
    rendered = utils.visualize_tokens(text, all_tokenizers)
    outputs = [html for html in rendered.values()]
    outputs.append(gr.update(visible=True))
    return outputs

# Create the Gradio interface.
# Layout: a text input on the left; tokenizer visualizations on the right in a
# 2x2-ish grid, with "collapsed" models relegated to a hidden accordion.
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
    .tokenizer-panel > div { background: var(--input-background-fill); }
    .no-padding { padding: 0 !important; }
    .form { border: 0 !important; }
    .html-container { line-height: 2em; !important; }
    .pending { opacity: 1; }

    @media (prefers-color-scheme: dark) {
        .gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
        .html-container span { color: black !important; }
    }
""") as demo:
    gr.Markdown("# Tokens matter.")

    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                every=True,  # This enables real-time updates
                # NOTE(review): gradio's `every` normally takes a polling
                # interval in seconds, not a bool — confirm `True` is intended.
            )

        # Right column for outputs
        with gr.Column(scale=2):
            # Create output boxes for main tokenizers
            main_output_boxes = []
            more_output_boxes = []

            # Create 2x2 grid for main tokenizers.
            # NOTE(review): the same three "collapsed" model names are
            # hard-coded here, in the loops below, and in get_model_icon —
            # consider a single module-level constant.
            with gr.Row():
                with gr.Column():
                    # First two tokenizers (minus collapsed ones) in the left column.
                    for name in all_tokenizer_names[:2]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            # Panel header: icon (if any) + bold display name.
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name"  style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)
                with gr.Column():
                    # Next two tokenizers (minus collapsed ones) in the right column.
                    for name in all_tokenizer_names[2:4]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name"  style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            main_output_boxes.append(box)

            # Display more tokenizers in accordion (only the "collapsed" models
            # land here; hidden until the first tokenization completes).
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                for name in all_tokenizer_names:
                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name"  style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                            more_output_boxes.append(box)

    # NOTE(review): process_text returns visualizations in all_tokenizers key
    # order, but the boxes here are ordered non-collapsed first, then collapsed.
    # If a collapsed model appears within the first four tokenizer names, the
    # outputs would be routed to the wrong boxes — verify against utils.
    all_outputs = main_output_boxes + more_output_boxes + [more_models]

    # Use change event for real-time updates
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )

    # Add examples: the same sentence in Swiss German, Standard German,
    # French, Italian, Romansh, and English.
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text
    )

if __name__ == "__main__":
    # Launch the Gradio app when executed directly.
    demo.launch()