# Commit 23d64a1 by MaziyarPanahi: "let's add the tsv file"
import gradio as gr
import pandas as pd
import numpy as np
from io import StringIO
import os
# Placeholder rows used only when FACTS.tsv cannot be read.
_SAMPLE_LEADERBOARD = {
    'model': [
        'deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        'meta-llama/Llama-3.3-70B-Instruct',
        'Qwen/Qwen3-30B-A3B',
        'google/gemma-3-27b-it',
    ],
    'size': [14, 70, 30, 27],
    'Separate Grounding Score': [0.817797, 0.842553, 0.812766, 0.936],
    'Separate Quality Score': [0.542373, 0.510638, 0.540426, 0.391],
    'Combined Score': [0.457627, 0.425532, 0.425532, 0.378],
}

try:
    # Preferred source: the tab-separated results file next to this script.
    df = pd.read_csv("FACTS.tsv", sep='\t')
    print(f"Successfully loaded {len(df)} models from local file")
except Exception as load_error:
    # Best-effort startup: report the failure and fall back to the sample
    # rows so the app still comes up.
    print(f"Error loading data from local file: {load_error}")
    df = pd.DataFrame(_SAMPLE_LEADERBOARD)
    print("Showing sample data (file read failed)")
def _format_size(size):
    """Render a size as e.g. "14B" for whole numbers or "1.7B" otherwise."""
    whole = int(size)
    if size == whole:
        return f"{whole}B"
    return f"{size}B"


# Normalise the raw table: drop incomplete rows, then trim stray whitespace
# from the header names so the rename below can match them.
df = df.dropna()
df.columns = df.columns.str.strip()

# Switch to the display-oriented column names used by the rest of the app.
df = df.rename(columns={'model': 'Model Name', 'size': 'Size'})

# Human-readable size label shown in the table.
df["Size_Display"] = df["Size"].apply(_format_size)
# Add size category for filtering
# (inclusive upper bound, label) pairs, checked in ascending order.
_SIZE_BUCKETS = (
    (5, "0-5B"),
    (10, "5-10B"),
    (20, "10-20B"),
    (40, "20-40B"),
    (80, "40-80B"),
)


def get_size_category(size):
    """Return the size-bucket label for a numeric model size.

    Each bucket includes its upper bound (5 -> "0-5B", 10 -> "5-10B").
    Anything that is not <= 80 falls through to ">80B".
    """
    for upper_bound, label in _SIZE_BUCKETS:
        if size <= upper_bound:
            return label
    return ">80B"
# Label every row with its size bucket; filter_and_search_models matches the
# selected size ranges against this "Size_Category" column.
df["Size_Category"] = df["Size"].apply(get_size_category)
def filter_and_search_models(
    search_query, size_ranges, sort_by, architecture_filters=None
):
    """Return the leaderboard rows that match the current filter settings.

    Works on a copy of the module-level ``df``; the source table is never
    modified.

    Args:
        search_query: Text to look for in "Model Name". Matching is
            case-insensitive and literal (not a regular expression), so
            input such as "(" or "c++" is safe. Empty/None disables it.
        size_ranges: "Size_Category" labels to keep. Empty/None disables
            the size filter.
        sort_by: Column to sort by, descending. Ignored when it is not a
            column of the table.
        architecture_filters: Family keys to keep ("llama", "deepseek",
            "qwen", "google", "mistral", "others"). A row is kept when it
            matches any selected key; "others" selects rows matching none
            of the named families. Unknown keys are ignored. Empty/None
            disables the filter.

    Returns:
        A DataFrame with columns "Rank", "Model Name", "Size" and the three
        score columns (rounded to 3 decimals). "Rank" is 1-based and
        follows the row order after sorting.
    """
    # Family key -> substring identifying that family in the model name.
    family_patterns = {
        "llama": "meta-llama",
        "deepseek": "deepseek",
        "qwen": "Qwen",
        "google": "google",
        "mistral": "mistralai",
    }
    score_columns = [
        "Separate Grounding Score",
        "Separate Quality Score",
        "Combined Score",
    ]

    filtered_df = df.copy()

    # Apply search filter
    if search_query:
        # regex=False: the query is free text typed by the user; treating it
        # as a pattern would raise re.error on input like "(" or "c++".
        name_matches = filtered_df["Model Name"].str.contains(
            search_query, case=False, na=False, regex=False
        )
        filtered_df = filtered_df[name_matches]

    # Apply size range filter
    if size_ranges:
        filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]

    # Apply architecture filter
    if architecture_filters:
        names = filtered_df["Model Name"]
        family_masks = {
            key: names.str.contains(pattern, case=False, na=False, regex=False)
            for key, pattern in family_patterns.items()
        }
        belongs_to_known_family = pd.Series(False, index=filtered_df.index)
        for family_mask in family_masks.values():
            belongs_to_known_family |= family_mask

        architecture_mask = pd.Series(False, index=filtered_df.index)
        for arch in architecture_filters:
            if arch == "others":
                # Models that don't match any of the named families.
                architecture_mask |= ~belongs_to_known_family
            elif arch in family_masks:
                architecture_mask |= family_masks[arch]
        filtered_df = filtered_df[architecture_mask]

    # Sort by selected metric
    if sort_by in filtered_df.columns:
        filtered_df = filtered_df.sort_values(sort_by, ascending=False)

    # Add ranking based on the sorted metric
    filtered_df = filtered_df.reset_index(drop=True)
    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    # Select the display columns and expose Size_Display under the cleaner
    # "Size" header. The explicit copy keeps the rounding below from writing
    # into a slice of filtered_df.
    display_df = (
        filtered_df[["Rank", "Model Name", "Size_Display"] + score_columns]
        .rename(columns={"Size_Display": "Size"})
        .copy()
    )

    # Round numerical values for better display
    display_df[score_columns] = display_df[score_columns].round(3)
    return display_df
def create_html_table(df):
    """Render a leaderboard dataframe as an HTML table string.

    Args:
        df: Table to render. Must contain a "Model Name" column. Cell CSS
            classes are assigned by column position: first column
            "rank-cell", second "model-cell", third "size-cell", all
            remaining columns "score-cell".

    Returns:
        HTML for a ``<div class="leaderboard-container">`` wrapping the
        table. Each body row carries a family class ("llama-row",
        "deepseek-row", "qwen-row", "google-row", "mistral-row" or
        "others-row") and the model name links to its Hugging Face page.
        Headers and cell text are HTML-escaped.

    Raises:
        KeyError: If a row has no "Model Name" entry.
    """
    # Function-scope imports: the original used a local named ``html``, and
    # only this function needs these helpers.
    from html import escape
    from urllib.parse import quote

    # Checked in order; the first substring found in the lower-cased model
    # name wins. Lower-casing keeps row colours consistent with the
    # case-insensitive family matching in filter_and_search_models.
    family_row_classes = (
        ("meta-llama", "llama-row"),
        ("deepseek", "deepseek-row"),
        ("qwen", "qwen-row"),
        ("google", "google-row"),
        ("mistralai", "mistral-row"),
    )
    positional_cell_classes = ("rank-cell", "model-cell", "size-cell")

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        '<div class="leaderboard-container">',
        '<table class="leaderboard-table">',
        "<thead><tr>",
    ]
    parts.extend(f"<th>{escape(str(col))}</th>" for col in df.columns)
    parts.append("</tr></thead>")

    parts.append("<tbody>")
    for _, row in df.iterrows():
        model_name = str(row["Model Name"])
        lowered_name = model_name.lower()
        row_class = next(
            (
                css_class
                for needle, css_class in family_row_classes
                if needle in lowered_name
            ),
            "others-row",
        )
        parts.append(f'<tr class="{row_class}">')

        for position, col in enumerate(df.columns):
            if position < len(positional_cell_classes):
                cell_class = positional_cell_classes[position]
            else:
                cell_class = "score-cell"

            if col == "Model Name":
                # Percent-encode the repo path ("/" kept) and escape the
                # result for use inside the href attribute.
                hf_url = "https://huggingface.co/" + quote(model_name, safe="/")
                cell_content = (
                    f'<a href="{escape(hf_url)}" target="_blank" '
                    f'class="model-link">{escape(model_name)}</a>'
                )
            else:
                cell_content = escape(str(row[col]))
            parts.append(f'<td class="{cell_class}">{cell_content}</td>')

        parts.append("</tr>")
    parts.append("</tbody>")
    parts.append("</table>")
    parts.append("</div>")
    return "".join(parts)
# Every size bucket / architecture key. Used as the widgets' default
# selection and for the first render of the table, so the two cannot drift.
ALL_SIZE_RANGES = ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"]
ALL_ARCHITECTURES = ["llama", "deepseek", "qwen", "google", "mistral", "others"]

# Create the Gradio interface
# NOTE(review): the widget nesting below is reconstructed from the section
# comments ("Left side", "Right side", "below filters"); confirm it matches
# the intended layout.
with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as app:
    gr.Markdown("# 🏆 FACTS Grounding Leaderboard")
    gr.Markdown(
        "### FACTS Medical Grounding is a benchmark designed to evaluate Open Models over medical domain."
    )
    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            # Top section with search and filters
            with gr.Row():
                # Left side - All Filters
                with gr.Column(scale=1):
                    gr.Markdown("### 🎛️ **Filter & Sort Options**")
                    # Sort dropdown with modern styling
                    with gr.Row():
                        sort_dropdown = gr.Dropdown(
                            choices=[
                                ("🏆 Combined Score", "Combined Score"),
                                ("🎯 Grounding Score", "Separate Grounding Score"),
                                ("📊 Quality Score", "Separate Quality Score"),
                            ],
                            value="Combined Score",
                            label="Sort by Metric",
                            elem_classes="sort-dropdown-modern",
                            container=True,
                        )
                    # Size filters
                    gr.Markdown("**📏 Filter by Model Size:**")
                    size_checkboxes = gr.CheckboxGroup(
                        choices=list(ALL_SIZE_RANGES),
                        value=list(ALL_SIZE_RANGES),
                        label="",
                        elem_classes="size-filter",
                        container=False,
                    )
                    # Model architecture filters
                    gr.Markdown("**🏗️ Filter by Model Architecture:**")
                    architecture_checkboxes = gr.CheckboxGroup(
                        # Order matters: the stylesheet colours these pills by
                        # position (.architecture-filter label:nth-child(1) is
                        # Llama, (2) DeepSeek, (3) Qwen, (4) Google,
                        # (5) Mistral, (6) Others).
                        choices=[
                            ("🦙 Llama", "llama"),
                            ("🤖 DeepSeek", "deepseek"),
                            ("🐧 Qwen", "qwen"),
                            ("🔷 Gemma", "google"),
                            ("🌟 Mistral", "mistral"),
                            ("🔧 Others", "others"),
                        ],
                        value=list(ALL_ARCHITECTURES),
                        label="",
                        elem_classes="architecture-filter",
                        container=False,
                    )
                # Right side - Search
                with gr.Column(scale=1):
                    gr.Markdown("### 🔍 **Search Models**")
                    search_box = gr.Textbox(
                        label="",
                        placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
                        value="",
                        elem_classes="search-input",
                    )
                    # Model count
                    total_models = gr.Markdown(f"**Showing {len(df)} models**")
            # Results table below filters
            results_table = gr.HTML(
                value=create_html_table(
                    filter_and_search_models(
                        "",
                        ALL_SIZE_RANGES,
                        "Combined Score",
                        ALL_ARCHITECTURES,
                    )
                ),
                elem_id="leaderboard-table",
            )
            # Metric explanations at the bottom
            with gr.Accordion("Metric Explanations", open=False):
                gr.Markdown(
                    """
- **Grounding Score**: Percentage of responses where all claims are supported by the context
- **Quality Score**: Percentage of responses that adequately address the user's request
- **Combined Score**: Percentage of responses that pass both quality and grounding checks
"""
                )
        with gr.TabItem("About"):
            gr.Markdown(
                """
# About This Evaluation
## FACTS Grounding Leaderboard
The FACTS Grounding Leaderboard is a benchmark developed by Google DeepMind to evaluate how well Large Language Models (LLMs) can generate factually accurate responses that are fully grounded in provided context documents.
### How It Works:
1. **Input**: Each example contains a system instruction, a context document (up to 32k tokens), and a user request
2. **Task**: Models must generate responses that answer the user's request using ONLY information from the provided context
3. **Evaluation**: Responses are evaluated in two phases:
- **Quality Check**: Does the response adequately address the user's request?
- **Grounding Check**: Is every claim in the response supported by the context document?
## Medical Domain Variation
This implementation focuses specifically on medical domain examples from the FACTS benchmark to evaluate smaller, open-source models in healthcare contexts.
### Key Modifications:
- **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
- **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
- **Focus on Accessibility**: Tests Qwen 3 1.7B, demonstrating that smaller models can be benchmarked on this important task
- **Streamlined Process**: Simplified evaluation pipeline suitable for resource-constrained environments
### Why Medical Domain?
Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
### Evaluation Metrics:
- **Grounding Score**: Percentage of responses where all claims are supported by the context
- **Quality Score**: Percentage of responses that adequately address the user's request
- **Combined Score**: Percentage of responses that pass both quality and grounding checks
This focused approach enables rapid iteration and testing of smaller models on domain-specific factual grounding tasks.
---
## References
- **Original Leaderboard by Google**: [FACTS Grounding Benchmark Leaderboard](https://www.kaggle.com/benchmarks/google/facts-grounding/leaderboard)
- **Public Dataset**: [FACTS Grounding Examples Dataset](https://www.kaggle.com/datasets/deepmind/facts-grounding-examples/data)
- **Technical Documentation**: [FACTS Grounding Benchmark Starter Code](https://www.kaggle.com/code/andrewmingwang/facts-grounding-benchmark-starter-code/notebook)
---
"""
            )

    # Update table when filters change
    def update_table(search, sizes, sort_by, arch_filters):
        """Re-run the filters and return (table HTML, model-count markdown)."""
        filtered_df = filter_and_search_models(search, sizes, sort_by, arch_filters)
        model_count = f"**Showing {len(filtered_df)} models**"
        return create_html_table(filtered_df), model_count

    # Connect all inputs to the update function: a change to any control
    # re-renders the table from the current state of all four controls.
    filter_controls = [
        search_box,
        size_checkboxes,
        sort_dropdown,
        architecture_checkboxes,
    ]
    for control in filter_controls:
        control.change(
            fn=update_table,
            inputs=filter_controls,
            outputs=[results_table, total_models],
        )
# Add custom CSS for better styling
# The stylesheet is attached to the already-built Blocks object. Notes:
# - The *-row / *-cell / model-link classes are the ones emitted by
#   create_html_table.
# - The `.architecture-filter label:nth-child(N)` rules are positional and
#   are written for the order Llama, DeepSeek, Qwen, Google, Mistral, Others.
# - In dark mode every family row uses alpha 0.1 at rest and 0.2 on hover.
app.css = """
.leaderboard-container {
margin-top: 20px;
max-height: 600px;
overflow-y: auto;
border-radius: 8px;
border: 1px solid #e9ecef;
}
.leaderboard-table {
width: 100%;
border-collapse: collapse;
font-size: 14px;
background: white;
}
.leaderboard-table th {
background-color: #f8f9fa;
font-weight: 600;
padding: 12px 8px;
text-align: center;
border-bottom: 2px solid #dee2e6;
position: sticky;
top: 0;
z-index: 10;
}
.leaderboard-table th:first-child {
width: 60px;
}
.leaderboard-table td {
padding: 10px 8px;
border-bottom: 1px solid #f1f3f4;
}
.leaderboard-table tbody tr:hover {
background-color: #f8f9fa;
}
.rank-cell {
text-align: center;
font-weight: 600;
color: #444;
background-color: #f8f9fa;
width: 60px;
}
.model-cell {
font-weight: 500;
max-width: 400px;
word-wrap: break-word;
}
.model-link {
color: #0066cc !important;
text-decoration: none !important;
font-weight: 500 !important;
transition: all 0.2s ease !important;
border-bottom: 1px solid transparent !important;
}
.model-link:hover {
color: #0052a3 !important;
border-bottom: 1px solid #0066cc !important;
background-color: rgba(0, 102, 204, 0.05) !important;
padding: 2px 4px !important;
border-radius: 4px !important;
margin: -2px -4px !important;
}
.size-cell {
text-align: center;
font-weight: 500;
color: #666;
min-width: 60px;
}
.score-cell {
text-align: center;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 13px;
}
/* Model family row styling */
.llama-row {
background-color: #fffbf0;
}
.llama-row:hover {
background-color: #fef7e0;
}
.deepseek-row {
background-color: #f0f8ff;
}
.deepseek-row:hover {
background-color: #e6f3ff;
}
.qwen-row {
background-color: #f5fff5;
}
.qwen-row:hover {
background-color: #eaffea;
}
.google-row {
background-color: #fff0f5;
}
.google-row:hover {
background-color: #ffe6f0;
}
.mistral-row {
background-color: #faf5ff;
}
.mistral-row:hover {
background-color: #f3e8ff;
}
.others-row {
background-color: #f8fafc;
}
.others-row:hover {
background-color: #f1f5f9;
}
.size-filter {
margin-top: 10px;
}
.size-filter > div {
display: flex !important;
flex-wrap: wrap !important;
gap: 8px !important;
align-items: center !important;
}
.size-filter label {
display: flex !important;
align-items: center !important;
background: #f8f9fa !important;
border: 2px solid #e9ecef !important;
border-radius: 8px !important;
padding: 8px 12px !important;
margin: 0 !important;
cursor: pointer !important;
transition: all 0.2s ease !important;
font-weight: 500 !important;
font-size: 14px !important;
color: #495057 !important;
min-width: 70px !important;
justify-content: center !important;
}
.size-filter label:hover {
background: #e9ecef !important;
border-color: #6c757d !important;
}
.size-filter input[type="checkbox"] {
display: none !important;
}
.size-filter input[type="checkbox"]:checked + span {
background: #0d6efd !important;
color: white !important;
border-color: #0d6efd !important;
}
.size-filter label:has(input[type="checkbox"]:checked) {
background: #0d6efd !important;
color: white !important;
border-color: #0d6efd !important;
box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
}
.architecture-filter {
margin-top: 10px;
}
.architecture-filter > div {
display: flex !important;
flex-wrap: wrap !important;
gap: 8px !important;
align-items: center !important;
}
.architecture-filter label {
display: flex !important;
align-items: center !important;
border-radius: 8px !important;
padding: 8px 12px !important;
margin: 0 !important;
cursor: pointer !important;
transition: all 0.2s ease !important;
font-weight: 500 !important;
font-size: 14px !important;
min-width: 140px !important;
justify-content: center !important;
border: 2px solid !important;
}
.architecture-filter label:hover {
transform: translateY(-1px);
box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
}
.architecture-filter input[type="checkbox"] {
display: none !important;
}
/* Llama styling */
.architecture-filter label:nth-child(1) {
background: #fffbf0 !important;
border-color: #f7e6a3 !important;
color: #8b4513 !important;
}
.architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
background: #f4a261 !important;
border-color: #f4a261 !important;
color: white !important;
box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
}
/* DeepSeek styling */
.architecture-filter label:nth-child(2) {
background: #f0f8ff !important;
border-color: #b3d9ff !important;
color: #1e40af !important;
}
.architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
background: #3b82f6 !important;
border-color: #3b82f6 !important;
color: white !important;
box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
}
/* Qwen styling */
.architecture-filter label:nth-child(3) {
background: #f5fff5 !important;
border-color: #b3ffb3 !important;
color: #15803d !important;
}
.architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
background: #22c55e !important;
border-color: #22c55e !important;
color: white !important;
box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
}
/* Google styling */
.architecture-filter label:nth-child(4) {
background: #fff0f5 !important;
border-color: #ffb3d9 !important;
color: #be185d !important;
}
.architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
background: #ec4899 !important;
border-color: #ec4899 !important;
color: white !important;
box-shadow: 0 2px 4px rgba(236, 72, 153, 0.3) !important;
}
/* Mistral styling */
.architecture-filter label:nth-child(5) {
background: #faf5ff !important;
border-color: #d8b4fe !important;
color: #7c3aed !important;
}
.architecture-filter label:nth-child(5):has(input[type="checkbox"]:checked) {
background: #8b5cf6 !important;
border-color: #8b5cf6 !important;
color: white !important;
box-shadow: 0 2px 4px rgba(139, 92, 246, 0.3) !important;
}
/* Others styling */
.architecture-filter label:nth-child(6) {
background: #f8fafc !important;
border-color: #cbd5e1 !important;
color: #475569 !important;
}
.architecture-filter label:nth-child(6):has(input[type="checkbox"]:checked) {
background: #64748b !important;
border-color: #64748b !important;
color: white !important;
box-shadow: 0 2px 4px rgba(100, 116, 139, 0.3) !important;
}
/* Search and Filter Section Styling */
.search-input input {
border: 2px solid #e9ecef !important;
border-radius: 12px !important;
padding: 12px 16px !important;
font-size: 14px !important;
transition: all 0.3s ease !important;
background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
}
.search-input input:focus {
border-color: #6366f1 !important;
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
background: white !important;
}
.search-input input::placeholder {
color: #6b7280 !important;
font-style: italic !important;
}
/* Modern Sort Dropdown Styling */
.sort-dropdown-modern label {
font-weight: 600 !important;
color: #374151 !important;
margin-bottom: 8px !important;
}
.sort-dropdown-modern .wrap {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border-radius: 12px !important;
padding: 2px !important;
border: none !important;
}
.sort-dropdown-modern select {
background: white !important;
border: none !important;
border-radius: 10px !important;
padding: 12px 16px !important;
font-size: 14px !important;
font-weight: 500 !important;
color: #374151 !important;
cursor: pointer !important;
transition: all 0.3s ease !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}
.sort-dropdown-modern select:hover {
box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
transform: translateY(-1px) !important;
}
.sort-dropdown-modern select:focus {
outline: none !important;
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
}
/* Section Headers */
h3 {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
background-clip: text !important;
margin-bottom: 12px !important;
}
/* Centered Architecture Section */
.centered-title {
text-align: center !important;
}
.centered-filter > div {
display: flex !important;
flex-wrap: wrap !important;
gap: 8px !important;
align-items: center !important;
justify-content: center !important;
}
.size-filter {
margin-top: 10px;
}
/* Dark Mode Specific Styles */
@media (prefers-color-scheme: dark) {
.leaderboard-table {
background: #1f2937 !important;
color: #f9fafb !important;
}
.leaderboard-table th {
background-color: #374151 !important;
color: #f9fafb !important;
border-bottom: 2px solid #4b5563 !important;
}
.leaderboard-table td {
color: #f9fafb !important;
border-bottom: 1px solid #374151 !important;
}
.leaderboard-table tbody tr:hover {
background-color: #374151 !important;
}
.rank-cell {
background-color: #374151 !important;
color: #f9fafb !important;
}
.model-cell {
color: #f9fafb !important;
}
.size-cell {
color: #d1d5db !important;
}
.score-cell {
color: #f9fafb !important;
}
/* Dark mode row colors with better contrast */
.llama-row {
background-color: rgba(245, 158, 11, 0.1) !important;
}
.llama-row:hover {
background-color: rgba(245, 158, 11, 0.2) !important;
}
.deepseek-row {
background-color: rgba(59, 130, 246, 0.1) !important;
}
.deepseek-row:hover {
background-color: rgba(59, 130, 246, 0.2) !important;
}
.qwen-row {
background-color: rgba(34, 197, 94, 0.1) !important;
}
.qwen-row:hover {
background-color: rgba(34, 197, 94, 0.2) !important;
}
.google-row {
background-color: rgba(236, 72, 153, 0.1) !important;
}
.google-row:hover {
background-color: rgba(236, 72, 153, 0.2) !important;
}
.mistral-row {
background-color: rgba(139, 92, 246, 0.1) !important;
}
.mistral-row:hover {
background-color: rgba(139, 92, 246, 0.2) !important;
}
.others-row {
background-color: rgba(107, 114, 128, 0.1) !important;
}
.others-row:hover {
background-color: rgba(107, 114, 128, 0.2) !important;
}
.leaderboard-container {
border: 1px solid #4b5563 !important;
}
.model-cell {
color: #f9fafb !important;
}
.model-link {
color: #60a5fa !important;
}
.model-link:hover {
color: #93c5fd !important;
border-bottom: 1px solid #60a5fa !important;
background-color: rgba(96, 165, 250, 0.1) !important;
}
.size-cell {
color: #d1d5db !important;
}
}
"""
# Launch the app
# Start the Gradio server only when this file is run as a script, so that
# importing the module does not launch anything.
if __name__ == "__main__":
    app.launch()