MaziyarPanahi committed on
Commit
2139f5c
·
1 Parent(s): 09aab35

refactor filters and style

Browse files
Files changed (1) hide show
  1. app.py +265 -31
app.py CHANGED
@@ -37,7 +37,9 @@ def get_size_category(size):
37
  df["Size_Category"] = df["Size"].apply(get_size_category)
38
 
39
 
40
- def filter_and_search_models(search_query, size_ranges, sort_by):
 
 
41
  """Filter and search models based on user inputs"""
42
  filtered_df = df.copy()
43
 
@@ -52,6 +54,32 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
52
  if size_ranges and len(size_ranges) > 0:
53
  filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Sort by selected metric
56
  if sort_by in filtered_df.columns:
57
  filtered_df = filtered_df.sort_values(sort_by, ascending=False)
@@ -139,37 +167,60 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
139
 
140
  with gr.Tabs():
141
  with gr.TabItem("Leaderboard"):
142
- # Filters at the top
143
  with gr.Row():
144
- with gr.Column(scale=2):
145
- search_box = gr.Textbox(
146
- label="Model Search",
147
- placeholder="Search for a model name...",
148
- value="",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  )
150
 
151
- with gr.Column(scale=1):
152
- sort_dropdown = gr.Dropdown(
 
153
  choices=[
154
- "Combined Score",
155
- "Separate Grounding Score",
156
- "Separate Quality Score",
 
157
  ],
158
- value="Combined Score",
159
- label="Sort by",
160
- elem_classes="sort-dropdown",
 
161
  )
162
 
163
- # Size filters in a row
164
- with gr.Row():
165
- gr.Markdown("**Filter by Model Size:**")
166
- size_checkboxes = gr.CheckboxGroup(
167
- choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
168
- value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
169
- label="",
170
- elem_classes="size-filter",
171
- container=False,
172
- )
173
 
174
  # Model count
175
  total_models = gr.Markdown(f"**Showing {len(df)} models**")
@@ -181,6 +232,7 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
181
  "",
182
  ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
183
  "Combined Score",
 
184
  )
185
  ),
186
  elem_id="leaderboard-table",
@@ -219,7 +271,8 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
219
  ### Key Modifications:
220
  - **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
221
  - **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
222
- - **Focus on Open Models**: Evaluates open-source models often missing from mainstream leaderboards for medical domain
 
223
 
224
  ### Why Medical Domain?
225
  Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
@@ -244,27 +297,33 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
244
  )
245
 
246
  # Update table when filters change
247
- def update_table(search, sizes, sort_by):
248
- filtered_df = filter_and_search_models(search, sizes, sort_by)
249
  model_count = f"**Showing {len(filtered_df)} models**"
250
  return create_html_table(filtered_df), model_count
251
 
252
  # Connect all inputs to the update function
253
  search_box.change(
254
  fn=update_table,
255
- inputs=[search_box, size_checkboxes, sort_dropdown],
256
  outputs=[results_table, total_models],
257
  )
258
 
259
  size_checkboxes.change(
260
  fn=update_table,
261
- inputs=[search_box, size_checkboxes, sort_dropdown],
262
  outputs=[results_table, total_models],
263
  )
264
 
265
  sort_dropdown.change(
266
  fn=update_table,
267
- inputs=[search_box, size_checkboxes, sort_dropdown],
 
 
 
 
 
 
268
  outputs=[results_table, total_models],
269
  )
270
 
@@ -418,6 +477,181 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
418
  border-color: #0d6efd !important;
419
  box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
420
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  """
422
 
423
  # Launch the app
 
37
  df["Size_Category"] = df["Size"].apply(get_size_category)
38
 
39
 
40
+ def filter_and_search_models(
41
+ search_query, size_ranges, sort_by, architecture_filters=None
42
+ ):
43
  """Filter and search models based on user inputs"""
44
  filtered_df = df.copy()
45
 
 
54
  if size_ranges and len(size_ranges) > 0:
55
  filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
56
 
57
+ # Apply architecture filter
58
+ if architecture_filters and len(architecture_filters) > 0:
59
+ architecture_mask = pd.Series(
60
+ [False] * len(filtered_df), index=filtered_df.index
61
+ )
62
+
63
+ for arch in architecture_filters:
64
+ if arch == "llama":
65
+ architecture_mask |= filtered_df["Model Name"].str.contains(
66
+ "meta-llama", case=False, na=False
67
+ )
68
+ elif arch == "deepseek":
69
+ architecture_mask |= filtered_df["Model Name"].str.contains(
70
+ "deepseek", case=False, na=False
71
+ )
72
+ elif arch == "qwen":
73
+ architecture_mask |= filtered_df["Model Name"].str.contains(
74
+ "Qwen", case=False, na=False
75
+ )
76
+ elif arch == "google":
77
+ architecture_mask |= filtered_df["Model Name"].str.contains(
78
+ "google", case=False, na=False
79
+ )
80
+
81
+ filtered_df = filtered_df[architecture_mask]
82
+
83
  # Sort by selected metric
84
  if sort_by in filtered_df.columns:
85
  filtered_df = filtered_df.sort_values(sort_by, ascending=False)
 
167
 
168
  with gr.Tabs():
169
  with gr.TabItem("Leaderboard"):
170
+ # Top section with search and filters
171
  with gr.Row():
172
+ # Left side - All Filters
173
+ with gr.Column(scale=1):
174
+ gr.Markdown("### 🎛️ **Filter & Sort Options**")
175
+
176
+ # Sort dropdown with modern styling
177
+ with gr.Row():
178
+ sort_dropdown = gr.Dropdown(
179
+ choices=[
180
+ ("🏆 Combined Score", "Combined Score"),
181
+ ("🎯 Grounding Score", "Separate Grounding Score"),
182
+ ("📊 Quality Score", "Separate Quality Score"),
183
+ ],
184
+ value="Combined Score",
185
+ label="Sort by Metric",
186
+ elem_classes="sort-dropdown-modern",
187
+ container=True,
188
+ )
189
+
190
+ # Size filters
191
+ gr.Markdown("**📏 Filter by Model Size:**")
192
+ size_checkboxes = gr.CheckboxGroup(
193
+ choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
194
+ value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
195
+ label="",
196
+ elem_classes="size-filter",
197
+ container=False,
198
  )
199
 
200
+ # Model architecture filters
201
+ gr.Markdown("**🏗️ Filter by Model Architecture:**")
202
+ architecture_checkboxes = gr.CheckboxGroup(
203
  choices=[
204
+ ("🤖 DeepSeek", "deepseek"),
205
+ ("🐧 Qwen", "qwen"),
206
+ ("🦙 Llama", "llama"),
207
+ ("🔷 Gemma", "google"),
208
  ],
209
+ value=["llama", "deepseek", "qwen", "google"],
210
+ label="",
211
+ elem_classes="architecture-filter",
212
+ container=False,
213
  )
214
 
215
+ # Right side - Search
216
+ with gr.Column(scale=1):
217
+ gr.Markdown("### 🔍 **Search Models**")
218
+ search_box = gr.Textbox(
219
+ label="",
220
+ placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
221
+ value="",
222
+ elem_classes="search-input",
223
+ )
 
224
 
225
  # Model count
226
  total_models = gr.Markdown(f"**Showing {len(df)} models**")
 
232
  "",
233
  ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
234
  "Combined Score",
235
+ ["llama", "deepseek", "qwen", "google"],
236
  )
237
  ),
238
  elem_id="leaderboard-table",
 
271
  ### Key Modifications:
272
  - **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
273
  - **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
274
+ - **Focus on Accessibility**: Tests Qwen 3 1.7B, demonstrating that smaller models can be benchmarked on this important task
275
+ - **Streamlined Process**: Simplified evaluation pipeline suitable for resource-constrained environments
276
 
277
  ### Why Medical Domain?
278
  Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
 
297
  )
298
 
299
  # Update table when filters change
300
+ def update_table(search, sizes, sort_by, arch_filters):
301
+ filtered_df = filter_and_search_models(search, sizes, sort_by, arch_filters)
302
  model_count = f"**Showing {len(filtered_df)} models**"
303
  return create_html_table(filtered_df), model_count
304
 
305
  # Connect all inputs to the update function
306
  search_box.change(
307
  fn=update_table,
308
+ inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
309
  outputs=[results_table, total_models],
310
  )
311
 
312
  size_checkboxes.change(
313
  fn=update_table,
314
+ inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
315
  outputs=[results_table, total_models],
316
  )
317
 
318
  sort_dropdown.change(
319
  fn=update_table,
320
+ inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
321
+ outputs=[results_table, total_models],
322
+ )
323
+
324
+ architecture_checkboxes.change(
325
+ fn=update_table,
326
+ inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
327
  outputs=[results_table, total_models],
328
  )
329
 
 
477
  border-color: #0d6efd !important;
478
  box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
479
  }
480
+
481
+ .architecture-filter {
482
+ margin-top: 10px;
483
+ }
484
+
485
+ .architecture-filter > div {
486
+ display: flex !important;
487
+ flex-wrap: wrap !important;
488
+ gap: 8px !important;
489
+ align-items: center !important;
490
+ }
491
+
492
+ .architecture-filter label {
493
+ display: flex !important;
494
+ align-items: center !important;
495
+ border-radius: 8px !important;
496
+ padding: 8px 12px !important;
497
+ margin: 0 !important;
498
+ cursor: pointer !important;
499
+ transition: all 0.2s ease !important;
500
+ font-weight: 500 !important;
501
+ font-size: 14px !important;
502
+ min-width: 140px !important;
503
+ justify-content: center !important;
504
+ border: 2px solid !important;
505
+ }
506
+
507
+ .architecture-filter label:hover {
508
+ transform: translateY(-1px);
509
+ box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
510
+ }
511
+
512
+ .architecture-filter input[type="checkbox"] {
513
+ display: none !important;
514
+ }
515
+
516
+ /* Llama styling */
517
+ .architecture-filter label:nth-child(1) {
518
+ background: #fffbf0 !important;
519
+ border-color: #f7e6a3 !important;
520
+ color: #8b4513 !important;
521
+ }
522
+
523
+ .architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
524
+ background: #f4a261 !important;
525
+ border-color: #f4a261 !important;
526
+ color: white !important;
527
+ box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
528
+ }
529
+
530
+ /* DeepSeek styling */
531
+ .architecture-filter label:nth-child(2) {
532
+ background: #f0f8ff !important;
533
+ border-color: #b3d9ff !important;
534
+ color: #1e40af !important;
535
+ }
536
+
537
+ .architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
538
+ background: #3b82f6 !important;
539
+ border-color: #3b82f6 !important;
540
+ color: white !important;
541
+ box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
542
+ }
543
+
544
+ /* Qwen styling */
545
+ .architecture-filter label:nth-child(3) {
546
+ background: #f5fff5 !important;
547
+ border-color: #b3ffb3 !important;
548
+ color: #15803d !important;
549
+ }
550
+
551
+ .architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
552
+ background: #22c55e !important;
553
+ border-color: #22c55e !important;
554
+ color: white !important;
555
+ box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
556
+ }
557
+
558
+ /* Google styling */
559
+ .architecture-filter label:nth-child(4) {
560
+ background: #fff0f5 !important;
561
+ border-color: #ffb3d9 !important;
562
+ color: #be185d !important;
563
+ }
564
+
565
+ .architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
566
+ background: #ec4899 !important;
567
+ border-color: #ec4899 !important;
568
+ color: white !important;
569
+ box-shadow: 0 2px 4px rgba(236, 72, 153, 0.3) !important;
570
+ }
571
+
572
+ /* Search and Filter Section Styling */
573
+ .search-input input {
574
+ border: 2px solid #e9ecef !important;
575
+ border-radius: 12px !important;
576
+ padding: 12px 16px !important;
577
+ font-size: 14px !important;
578
+ transition: all 0.3s ease !important;
579
+ background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
580
+ }
581
+
582
+ .search-input input:focus {
583
+ border-color: #6366f1 !important;
584
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
585
+ background: white !important;
586
+ }
587
+
588
+ .search-input input::placeholder {
589
+ color: #6b7280 !important;
590
+ font-style: italic !important;
591
+ }
592
+
593
+ /* Modern Sort Dropdown Styling */
594
+ .sort-dropdown-modern label {
595
+ font-weight: 600 !important;
596
+ color: #374151 !important;
597
+ margin-bottom: 8px !important;
598
+ }
599
+
600
+ .sort-dropdown-modern .wrap {
601
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
602
+ border-radius: 12px !important;
603
+ padding: 2px !important;
604
+ border: none !important;
605
+ }
606
+
607
+ .sort-dropdown-modern select {
608
+ background: white !important;
609
+ border: none !important;
610
+ border-radius: 10px !important;
611
+ padding: 12px 16px !important;
612
+ font-size: 14px !important;
613
+ font-weight: 500 !important;
614
+ color: #374151 !important;
615
+ cursor: pointer !important;
616
+ transition: all 0.3s ease !important;
617
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
618
+ }
619
+
620
+ .sort-dropdown-modern select:hover {
621
+ box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
622
+ transform: translateY(-1px) !important;
623
+ }
624
+
625
+ .sort-dropdown-modern select:focus {
626
+ outline: none !important;
627
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
628
+ }
629
+
630
+ /* Section Headers */
631
+ h3 {
632
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
633
+ -webkit-background-clip: text !important;
634
+ -webkit-text-fill-color: transparent !important;
635
+ background-clip: text !important;
636
+ margin-bottom: 12px !important;
637
+ }
638
+
639
+ /* Centered Architecture Section */
640
+ .centered-title {
641
+ text-align: center !important;
642
+ }
643
+
644
+ .centered-filter > div {
645
+ display: flex !important;
646
+ flex-wrap: wrap !important;
647
+ gap: 8px !important;
648
+ align-items: center !important;
649
+ justify-content: center !important;
650
+ }
651
+
652
+ .size-filter {
653
+ margin-top: 10px;
654
+ }
655
  """
656
 
657
  # Launch the app