Commit
·
2139f5c
1
Parent(s):
09aab35
refactor filters and style
Browse files
app.py
CHANGED
@@ -37,7 +37,9 @@ def get_size_category(size):
|
|
37 |
df["Size_Category"] = df["Size"].apply(get_size_category)
|
38 |
|
39 |
|
40 |
-
def filter_and_search_models(
|
|
|
|
|
41 |
"""Filter and search models based on user inputs"""
|
42 |
filtered_df = df.copy()
|
43 |
|
@@ -52,6 +54,32 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
|
|
52 |
if size_ranges and len(size_ranges) > 0:
|
53 |
filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
# Sort by selected metric
|
56 |
if sort_by in filtered_df.columns:
|
57 |
filtered_df = filtered_df.sort_values(sort_by, ascending=False)
|
@@ -139,37 +167,60 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
|
|
139 |
|
140 |
with gr.Tabs():
|
141 |
with gr.TabItem("Leaderboard"):
|
142 |
-
#
|
143 |
with gr.Row():
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
)
|
150 |
|
151 |
-
|
152 |
-
|
|
|
153 |
choices=[
|
154 |
-
"
|
155 |
-
"
|
156 |
-
"
|
|
|
157 |
],
|
158 |
-
value="
|
159 |
-
label="
|
160 |
-
elem_classes="
|
|
|
161 |
)
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
)
|
173 |
|
174 |
# Model count
|
175 |
total_models = gr.Markdown(f"**Showing {len(df)} models**")
|
@@ -181,6 +232,7 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
|
|
181 |
"",
|
182 |
["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
|
183 |
"Combined Score",
|
|
|
184 |
)
|
185 |
),
|
186 |
elem_id="leaderboard-table",
|
@@ -219,7 +271,8 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
|
|
219 |
### Key Modifications:
|
220 |
- **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
|
221 |
- **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
|
222 |
-
- **Focus on
|
|
|
223 |
|
224 |
### Why Medical Domain?
|
225 |
Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
|
@@ -244,27 +297,33 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
|
|
244 |
)
|
245 |
|
246 |
# Update table when filters change
|
247 |
-
def update_table(search, sizes, sort_by):
|
248 |
-
filtered_df = filter_and_search_models(search, sizes, sort_by)
|
249 |
model_count = f"**Showing {len(filtered_df)} models**"
|
250 |
return create_html_table(filtered_df), model_count
|
251 |
|
252 |
# Connect all inputs to the update function
|
253 |
search_box.change(
|
254 |
fn=update_table,
|
255 |
-
inputs=[search_box, size_checkboxes, sort_dropdown],
|
256 |
outputs=[results_table, total_models],
|
257 |
)
|
258 |
|
259 |
size_checkboxes.change(
|
260 |
fn=update_table,
|
261 |
-
inputs=[search_box, size_checkboxes, sort_dropdown],
|
262 |
outputs=[results_table, total_models],
|
263 |
)
|
264 |
|
265 |
sort_dropdown.change(
|
266 |
fn=update_table,
|
267 |
-
inputs=[search_box, size_checkboxes, sort_dropdown],
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
outputs=[results_table, total_models],
|
269 |
)
|
270 |
|
@@ -418,6 +477,181 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
|
|
418 |
border-color: #0d6efd !important;
|
419 |
box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
|
420 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
"""
|
422 |
|
423 |
# Launch the app
|
|
|
37 |
df["Size_Category"] = df["Size"].apply(get_size_category)
|
38 |
|
39 |
|
40 |
+
def filter_and_search_models(
|
41 |
+
search_query, size_ranges, sort_by, architecture_filters=None
|
42 |
+
):
|
43 |
"""Filter and search models based on user inputs"""
|
44 |
filtered_df = df.copy()
|
45 |
|
|
|
54 |
if size_ranges and len(size_ranges) > 0:
|
55 |
filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
|
56 |
|
57 |
+
# Apply architecture filter
|
58 |
+
if architecture_filters and len(architecture_filters) > 0:
|
59 |
+
architecture_mask = pd.Series(
|
60 |
+
[False] * len(filtered_df), index=filtered_df.index
|
61 |
+
)
|
62 |
+
|
63 |
+
for arch in architecture_filters:
|
64 |
+
if arch == "llama":
|
65 |
+
architecture_mask |= filtered_df["Model Name"].str.contains(
|
66 |
+
"meta-llama", case=False, na=False
|
67 |
+
)
|
68 |
+
elif arch == "deepseek":
|
69 |
+
architecture_mask |= filtered_df["Model Name"].str.contains(
|
70 |
+
"deepseek", case=False, na=False
|
71 |
+
)
|
72 |
+
elif arch == "qwen":
|
73 |
+
architecture_mask |= filtered_df["Model Name"].str.contains(
|
74 |
+
"Qwen", case=False, na=False
|
75 |
+
)
|
76 |
+
elif arch == "google":
|
77 |
+
architecture_mask |= filtered_df["Model Name"].str.contains(
|
78 |
+
"google", case=False, na=False
|
79 |
+
)
|
80 |
+
|
81 |
+
filtered_df = filtered_df[architecture_mask]
|
82 |
+
|
83 |
# Sort by selected metric
|
84 |
if sort_by in filtered_df.columns:
|
85 |
filtered_df = filtered_df.sort_values(sort_by, ascending=False)
|
|
|
167 |
|
168 |
with gr.Tabs():
|
169 |
with gr.TabItem("Leaderboard"):
|
170 |
+
# Top section with search and filters
|
171 |
with gr.Row():
|
172 |
+
# Left side - All Filters
|
173 |
+
with gr.Column(scale=1):
|
174 |
+
gr.Markdown("### 🎛️ **Filter & Sort Options**")
|
175 |
+
|
176 |
+
# Sort dropdown with modern styling
|
177 |
+
with gr.Row():
|
178 |
+
sort_dropdown = gr.Dropdown(
|
179 |
+
choices=[
|
180 |
+
("🏆 Combined Score", "Combined Score"),
|
181 |
+
("🎯 Grounding Score", "Separate Grounding Score"),
|
182 |
+
("📊 Quality Score", "Separate Quality Score"),
|
183 |
+
],
|
184 |
+
value="Combined Score",
|
185 |
+
label="Sort by Metric",
|
186 |
+
elem_classes="sort-dropdown-modern",
|
187 |
+
container=True,
|
188 |
+
)
|
189 |
+
|
190 |
+
# Size filters
|
191 |
+
gr.Markdown("**📏 Filter by Model Size:**")
|
192 |
+
size_checkboxes = gr.CheckboxGroup(
|
193 |
+
choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
|
194 |
+
value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
|
195 |
+
label="",
|
196 |
+
elem_classes="size-filter",
|
197 |
+
container=False,
|
198 |
)
|
199 |
|
200 |
+
# Model architecture filters
|
201 |
+
gr.Markdown("**🏗️ Filter by Model Architecture:**")
|
202 |
+
architecture_checkboxes = gr.CheckboxGroup(
|
203 |
choices=[
|
204 |
+
("🤖 DeepSeek", "deepseek"),
|
205 |
+
("🐧 Qwen", "qwen"),
|
206 |
+
("🦙 Llama", "llama"),
|
207 |
+
("🔷 Gemma", "google"),
|
208 |
],
|
209 |
+
value=["llama", "deepseek", "qwen", "google"],
|
210 |
+
label="",
|
211 |
+
elem_classes="architecture-filter",
|
212 |
+
container=False,
|
213 |
)
|
214 |
|
215 |
+
# Right side - Search
|
216 |
+
with gr.Column(scale=1):
|
217 |
+
gr.Markdown("### 🔍 **Search Models**")
|
218 |
+
search_box = gr.Textbox(
|
219 |
+
label="",
|
220 |
+
placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
|
221 |
+
value="",
|
222 |
+
elem_classes="search-input",
|
223 |
+
)
|
|
|
224 |
|
225 |
# Model count
|
226 |
total_models = gr.Markdown(f"**Showing {len(df)} models**")
|
|
|
232 |
"",
|
233 |
["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
|
234 |
"Combined Score",
|
235 |
+
["llama", "deepseek", "qwen", "google"],
|
236 |
)
|
237 |
),
|
238 |
elem_id="leaderboard-table",
|
|
|
271 |
### Key Modifications:
|
272 |
- **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
|
273 |
- **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
|
274 |
+
- **Focus on Accessibility**: Tests Qwen 3 1.7B, demonstrating that smaller models can be benchmarked on this important task
|
275 |
+
- **Streamlined Process**: Simplified evaluation pipeline suitable for resource-constrained environments
|
276 |
|
277 |
### Why Medical Domain?
|
278 |
Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
|
|
|
297 |
)
|
298 |
|
299 |
# Update table when filters change
|
300 |
+
def update_table(search, sizes, sort_by, arch_filters):
|
301 |
+
filtered_df = filter_and_search_models(search, sizes, sort_by, arch_filters)
|
302 |
model_count = f"**Showing {len(filtered_df)} models**"
|
303 |
return create_html_table(filtered_df), model_count
|
304 |
|
305 |
# Connect all inputs to the update function
|
306 |
search_box.change(
|
307 |
fn=update_table,
|
308 |
+
inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
|
309 |
outputs=[results_table, total_models],
|
310 |
)
|
311 |
|
312 |
size_checkboxes.change(
|
313 |
fn=update_table,
|
314 |
+
inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
|
315 |
outputs=[results_table, total_models],
|
316 |
)
|
317 |
|
318 |
sort_dropdown.change(
|
319 |
fn=update_table,
|
320 |
+
inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
|
321 |
+
outputs=[results_table, total_models],
|
322 |
+
)
|
323 |
+
|
324 |
+
architecture_checkboxes.change(
|
325 |
+
fn=update_table,
|
326 |
+
inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
|
327 |
outputs=[results_table, total_models],
|
328 |
)
|
329 |
|
|
|
477 |
border-color: #0d6efd !important;
|
478 |
box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
|
479 |
}
|
480 |
+
|
481 |
+
.architecture-filter {
|
482 |
+
margin-top: 10px;
|
483 |
+
}
|
484 |
+
|
485 |
+
.architecture-filter > div {
|
486 |
+
display: flex !important;
|
487 |
+
flex-wrap: wrap !important;
|
488 |
+
gap: 8px !important;
|
489 |
+
align-items: center !important;
|
490 |
+
}
|
491 |
+
|
492 |
+
.architecture-filter label {
|
493 |
+
display: flex !important;
|
494 |
+
align-items: center !important;
|
495 |
+
border-radius: 8px !important;
|
496 |
+
padding: 8px 12px !important;
|
497 |
+
margin: 0 !important;
|
498 |
+
cursor: pointer !important;
|
499 |
+
transition: all 0.2s ease !important;
|
500 |
+
font-weight: 500 !important;
|
501 |
+
font-size: 14px !important;
|
502 |
+
min-width: 140px !important;
|
503 |
+
justify-content: center !important;
|
504 |
+
border: 2px solid !important;
|
505 |
+
}
|
506 |
+
|
507 |
+
.architecture-filter label:hover {
|
508 |
+
transform: translateY(-1px);
|
509 |
+
box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
|
510 |
+
}
|
511 |
+
|
512 |
+
.architecture-filter input[type="checkbox"] {
|
513 |
+
display: none !important;
|
514 |
+
}
|
515 |
+
|
516 |
+
/* Llama styling */
|
517 |
+
.architecture-filter label:nth-child(1) {
|
518 |
+
background: #fffbf0 !important;
|
519 |
+
border-color: #f7e6a3 !important;
|
520 |
+
color: #8b4513 !important;
|
521 |
+
}
|
522 |
+
|
523 |
+
.architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
|
524 |
+
background: #f4a261 !important;
|
525 |
+
border-color: #f4a261 !important;
|
526 |
+
color: white !important;
|
527 |
+
box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
|
528 |
+
}
|
529 |
+
|
530 |
+
/* DeepSeek styling */
|
531 |
+
.architecture-filter label:nth-child(2) {
|
532 |
+
background: #f0f8ff !important;
|
533 |
+
border-color: #b3d9ff !important;
|
534 |
+
color: #1e40af !important;
|
535 |
+
}
|
536 |
+
|
537 |
+
.architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
|
538 |
+
background: #3b82f6 !important;
|
539 |
+
border-color: #3b82f6 !important;
|
540 |
+
color: white !important;
|
541 |
+
box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
|
542 |
+
}
|
543 |
+
|
544 |
+
/* Qwen styling */
|
545 |
+
.architecture-filter label:nth-child(3) {
|
546 |
+
background: #f5fff5 !important;
|
547 |
+
border-color: #b3ffb3 !important;
|
548 |
+
color: #15803d !important;
|
549 |
+
}
|
550 |
+
|
551 |
+
.architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
|
552 |
+
background: #22c55e !important;
|
553 |
+
border-color: #22c55e !important;
|
554 |
+
color: white !important;
|
555 |
+
box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
|
556 |
+
}
|
557 |
+
|
558 |
+
/* Google styling */
|
559 |
+
.architecture-filter label:nth-child(4) {
|
560 |
+
background: #fff0f5 !important;
|
561 |
+
border-color: #ffb3d9 !important;
|
562 |
+
color: #be185d !important;
|
563 |
+
}
|
564 |
+
|
565 |
+
.architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
|
566 |
+
background: #ec4899 !important;
|
567 |
+
border-color: #ec4899 !important;
|
568 |
+
color: white !important;
|
569 |
+
box-shadow: 0 2px 4px rgba(236, 72, 153, 0.3) !important;
|
570 |
+
}
|
571 |
+
|
572 |
+
/* Search and Filter Section Styling */
|
573 |
+
.search-input input {
|
574 |
+
border: 2px solid #e9ecef !important;
|
575 |
+
border-radius: 12px !important;
|
576 |
+
padding: 12px 16px !important;
|
577 |
+
font-size: 14px !important;
|
578 |
+
transition: all 0.3s ease !important;
|
579 |
+
background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
|
580 |
+
}
|
581 |
+
|
582 |
+
.search-input input:focus {
|
583 |
+
border-color: #6366f1 !important;
|
584 |
+
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
|
585 |
+
background: white !important;
|
586 |
+
}
|
587 |
+
|
588 |
+
.search-input input::placeholder {
|
589 |
+
color: #6b7280 !important;
|
590 |
+
font-style: italic !important;
|
591 |
+
}
|
592 |
+
|
593 |
+
/* Modern Sort Dropdown Styling */
|
594 |
+
.sort-dropdown-modern label {
|
595 |
+
font-weight: 600 !important;
|
596 |
+
color: #374151 !important;
|
597 |
+
margin-bottom: 8px !important;
|
598 |
+
}
|
599 |
+
|
600 |
+
.sort-dropdown-modern .wrap {
|
601 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
602 |
+
border-radius: 12px !important;
|
603 |
+
padding: 2px !important;
|
604 |
+
border: none !important;
|
605 |
+
}
|
606 |
+
|
607 |
+
.sort-dropdown-modern select {
|
608 |
+
background: white !important;
|
609 |
+
border: none !important;
|
610 |
+
border-radius: 10px !important;
|
611 |
+
padding: 12px 16px !important;
|
612 |
+
font-size: 14px !important;
|
613 |
+
font-weight: 500 !important;
|
614 |
+
color: #374151 !important;
|
615 |
+
cursor: pointer !important;
|
616 |
+
transition: all 0.3s ease !important;
|
617 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
|
618 |
+
}
|
619 |
+
|
620 |
+
.sort-dropdown-modern select:hover {
|
621 |
+
box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
|
622 |
+
transform: translateY(-1px) !important;
|
623 |
+
}
|
624 |
+
|
625 |
+
.sort-dropdown-modern select:focus {
|
626 |
+
outline: none !important;
|
627 |
+
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
|
628 |
+
}
|
629 |
+
|
630 |
+
/* Section Headers */
|
631 |
+
h3 {
|
632 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
633 |
+
-webkit-background-clip: text !important;
|
634 |
+
-webkit-text-fill-color: transparent !important;
|
635 |
+
background-clip: text !important;
|
636 |
+
margin-bottom: 12px !important;
|
637 |
+
}
|
638 |
+
|
639 |
+
/* Centered Architecture Section */
|
640 |
+
.centered-title {
|
641 |
+
text-align: center !important;
|
642 |
+
}
|
643 |
+
|
644 |
+
.centered-filter > div {
|
645 |
+
display: flex !important;
|
646 |
+
flex-wrap: wrap !important;
|
647 |
+
gap: 8px !important;
|
648 |
+
align-items: center !important;
|
649 |
+
justify-content: center !important;
|
650 |
+
}
|
651 |
+
|
652 |
+
.size-filter {
|
653 |
+
margin-top: 10px;
|
654 |
+
}
|
655 |
"""
|
656 |
|
657 |
# Launch the app
|