# NOTE(review): the following lines were Hugging Face Spaces page-scrape
# residue ("Spaces: Running", a file-size line, commit hashes, and a gutter
# line-number dump) accidentally captured with the source; they are not code
# and have been neutralized so the module parses.
import gradio as gr
import utils
import os
# Create a custom theme
# Base Gradio theme with larger text, square corners, and a Source Sans 3
# font stack (falling back to system sans-serif fonts).
theme = gr.themes.Base(
    text_size="lg",
    radius_size="none",
    font=[gr.themes.GoogleFont('Source Sans 3'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
)
# Load tokenizers only once during development
# gr.NO_RELOAD marks module-level work that should run only on the initial
# import, not again on every hot reload in `gradio app.py` dev mode.
if gr.NO_RELOAD:
    print("Loading tokenizers...")
    # NOTE(review): `all_tokenizers` is presumably a dict mapping model name ->
    # tokenizer; the UI below relies on its iteration order — confirm in utils.
    all_tokenizers = utils.load_tokenizers()
    all_tokenizer_names = list(all_tokenizers.keys())
    print("Tokenizers loaded!")
def read_svg_file(name: str) -> str:
    """Return the raw SVG markup for a model's icon, or "" on failure.

    Args:
        name: Model identifier (e.g. "gpt-4o"). Names not present in the
            icon map fall back to the ChatGPT icon.

    Returns:
        The contents of the matching file under ``icons/``, or an empty
        string if the file cannot be read (the error is printed, not raised).
    """
    icon_map = {
        "meta-llama/Llama-4-Scout-17B-16E-Instruct": "llama.svg",
        "deepseek-ai/DeepSeek-V3-0324": "deepseek.svg",
        "ZurichNLP/swissbert": "swissbert.svg",
        "mistralai/Mistral-Nemo-Instruct-2407": "mistral.svg",
        "google/gemma-3-27b-it": "gemma.svg",
        "gpt-4o": "chatgpt.svg"
    }
    icon_path = os.path.join("icons", icon_map.get(name, "chatgpt.svg"))
    try:
        # Explicit encoding so reading does not depend on the host locale.
        with open(icon_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        # Narrowed from bare `except Exception`: only I/O and decode errors
        # are expected here; anything else should surface as a real bug.
        print(f"Error reading SVG file {icon_path}: {e}")
        return ""
def get_model_icon(name: str) -> str:
    """Return an inline-HTML icon snippet for *name*, or "" if none applies."""
    # Models shown only in the collapsed "More Models" section get no icon.
    hidden_models = (
        "google/gemma-3-27b-it",
        "mistralai/Mistral-Nemo-Instruct-2407",
        "CohereLabs/aya-expanse-8b",
    )
    if name in hidden_models:
        return ""
    svg = read_svg_file(name)
    if not svg:
        return ""
    # Pin the <svg> element to a fixed 24x24 box while keeping aspect ratio.
    sized_svg = svg.replace(
        '<svg',
        '<svg preserveAspectRatio="xMidYMid meet" style="height: 24px; width: 24px;"',
    )
    # Container div keeps the icon vertically centered next to the model name.
    return (
        '<div style="display: inline-block; vertical-align: middle; '
        'margin-right: 8px; height: 24px; width: 24px; overflow: hidden;">'
        f'{sized_svg}</div>'
    )
def process_text(text):
    """Tokenize *text* with every pre-loaded tokenizer.

    Returns one HTML visualization per tokenizer (in tokenizer order),
    followed by a Gradio update that reveals the "More Models" accordion.
    """
    # Tokenizers were loaded once at module import; reuse them here.
    rendered = utils.visualize_tokens(text, all_tokenizers)
    return [*rendered.values(), gr.update(visible=True)]
# Create the Gradio interface
# Layout: a text input on the left; a 2x2 grid of tokenizer panels on the
# right, plus a hidden "More Models" accordion that is revealed after the
# first tokenization.
with gr.Blocks(title="Tokens matter.", theme=theme, css="""
.tokenizer-panel > div { background: var(--input-background-fill); }
.no-padding { padding: 0 !important; }
.form { border: 0 !important; }
.html-container { line-height: 2em; !important; }
.pending { opacity: 1; }
@media (prefers-color-scheme: dark) {
.gradio-container.gradio-container-5-29-0 .contain .html-container span.model-name { color: white !important; }
.html-container span { color: black !important; }
}
""") as demo:
    gr.Markdown("# Tokens matter.")
    with gr.Row():
        # Left column for inputs
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="Input Text:",
                placeholder="Enter text to tokenize ...",
                value="Als Zürcher bini nöd so Fan vom FC Basel.",
                lines=3,
                elem_classes="no-padding",
                interactive=True,
                # NOTE(review): `every` normally takes a refresh interval in
                # seconds; `True` looks suspicious — confirm against the
                # installed Gradio version's Textbox API.
                every=True, # This enables real-time updates
            )
        # Right column for outputs
        with gr.Column(scale=2):
            # Create output boxes for main tokenizers
            main_output_boxes = []
            more_output_boxes = []
            # Create 2x2 grid for main tokenizers
            with gr.Row():
                with gr.Column():
                    # First grid column: tokenizers 0-1, skipping the
                    # models reserved for the collapsed accordion.
                    for name in all_tokenizer_names[:2]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            # Header row: icon + bold display name.
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                        main_output_boxes.append(box)
                with gr.Column():
                    # Second grid column: tokenizers 2-3, same skip rule.
                    for name in all_tokenizer_names[2:4]:
                        if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                            continue
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                        main_output_boxes.append(box)
            # Display more tokenizers in accordion
            # Hidden until the first tokenization (process_text returns
            # gr.update(visible=True) as its last output).
            more_models = gr.Accordion("More Models", open=False, visible=False)
            with more_models:
                # Only the models skipped from the main grid get a panel here
                # (note: the `if` GATES panel creation; there is no `continue`).
                for name in all_tokenizer_names:
                    if name in ["google/gemma-3-27b-it", "mistralai/Mistral-Nemo-Instruct-2407", "CohereLabs/aya-expanse-8b"]:
                        display_name = utils.MODEL_DISPLAY_NAMES.get(name, name)
                        with gr.Group(elem_classes="tokenizer-panel"):
                            gr.HTML(value=f'<div style="display: flex; align-items: center; margin-bottom: 8px;">{get_model_icon(name)}<span class="model-name" style="font-weight: bold;">{display_name}</span></div>')
                            box = gr.HTML()
                        more_output_boxes.append(box)
    # NOTE(review): process_text emits visualizations in all_tokenizers
    # iteration order; this concatenation assumes the first four names are the
    # non-collapsed ones so main boxes line up with the first four outputs —
    # verify against utils.load_tokenizers() ordering.
    all_outputs = main_output_boxes + more_output_boxes + [more_models]
    # Use change event for real-time updates
    input_text.change(
        fn=process_text,
        inputs=[input_text],
        outputs=all_outputs,
        show_progress="hidden",
    )
    # Add examples
    # The same sentence in Swiss German, Standard German, French, Italian,
    # Romansh, and English.
    gr.Examples(
        examples=[
            ["Als Zürcher bini nöd so Fan vom FC Basel."],
            ["Als Zürcher bin ich nicht sonderlich Fan des FC Basel."],
            ["En tant que Zurichois, je ne suis pas un grand fan du FC Bâle."],
            ["Come Zurighese, non sono un grande fan del FC Basilea."],
            ["Sco Turitgais na sun jau betg in grond fan da l'FC Basilea."],
            ["As a Zurich resident, I am not a big fan of FC Basel."],
        ],
        inputs=input_text
    )
if __name__ == "__main__":
    # Start the Gradio server only when executed directly (not on import).
    demo.launch()