MaziyarPanahi committed on
Commit
254bcd7
·
1 Parent(s): 1bc77fb
Files changed (1) hide show
  1. app.py +143 -86
app.py CHANGED
@@ -2,70 +2,105 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
 
5
- # Sample data structure - replace this with your actual CSV loading
6
- # df = pd.read_csv('your_leaderboard_data.csv')
7
-
8
- # For demonstration, I'll create sample data matching your structure
9
  data = {
10
- 'Model': ['Llama-3-70B-UltraMedical', 'MMed-Llama-3-8B', 'Llama-3.1-8B-UltraMedical', 'meditron-70b', 'meditron-7b'],
11
- 'Domain': ['Medical', 'Medical', 'Medical', 'Medical', 'Medical'],
12
- 'License': ['Llama-3', 'Llama-3', 'Llama-3', 'Apache 2.0', 'Apache 2.0'],
13
- 'Size (B)': [70, 8, 8, 70, 7],
14
- 'Size_Category': ['40-80', '5-10', '5-10', '40-80', '5-10'], # Added for filtering
15
- 'Accessibility': ['Open Source', 'Open Source', 'Open Source', 'Open Source', 'Open Source'],
16
- 'Average Performance': [33.4, 20.37, 20.16, 15.68, 9.52],
17
- 'ADE-Identification': [77.96, 83.15, 64.43, 51.8, 39.01],
18
- 'BrainMRI-AIS': [95.91, 91.82, 92.48, 87.27, 64.47],
19
- 'Brateca-Hospitalization': [59.23, 55.1, 35.42, 43.18, 40.1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
21
 
 
22
  df = pd.DataFrame(data)
23
 
24
- def filter_and_search_models(search_query, domain_filter, size_ranges, accessibility_filter):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  """Filter and search models based on user inputs"""
26
  filtered_df = df.copy()
27
 
28
  # Apply search filter
29
  if search_query:
30
- mask = filtered_df['Model'].str.contains(search_query, case=False, na=False)
31
  filtered_df = filtered_df[mask]
32
 
33
- # Apply domain filter
34
- if domain_filter:
35
- if domain_filter == "Medical":
36
- filtered_df = filtered_df[filtered_df['Domain'] == 'Medical']
37
- elif domain_filter == "General":
38
- filtered_df = filtered_df[filtered_df['Domain'] == 'General']
39
-
40
  # Apply size range filter
41
  if size_ranges and len(size_ranges) > 0:
42
  filtered_df = filtered_df[filtered_df['Size_Category'].isin(size_ranges)]
43
 
44
- # Apply accessibility filter
45
- if accessibility_filter:
46
- if accessibility_filter == "Open Source":
47
- filtered_df = filtered_df[filtered_df['Accessibility'] == 'Open Source']
48
- elif accessibility_filter == "Proprietary":
49
- filtered_df = filtered_df[filtered_df['Accessibility'] == 'Proprietary']
50
 
51
- # Sort by Average Performance (descending)
52
- filtered_df = filtered_df.sort_values('Average Performance', ascending=False)
53
-
54
- # Format the dataframe for display
55
- display_df = filtered_df[['Model', 'Domain', 'License', 'Size (B)',
56
- 'Average Performance', 'ADE-Identification',
57
- 'BrainMRI-AIS', 'Brateca-Hospitalization']]
58
 
59
  # Round numerical values for better display
60
- for col in ['Average Performance', 'ADE-Identification', 'BrainMRI-AIS', 'Brateca-Hospitalization']:
61
- display_df[col] = display_df[col].round(2)
62
 
63
  return display_df
64
 
65
  # Create the Gradio interface
66
  with gr.Blocks(title="FACT Leaderboard", theme=gr.themes.Base()) as app:
67
  gr.Markdown("# 🏆 FACT Leaderboard")
68
- gr.Markdown("### Filter and search medical AI models by performance metrics")
69
 
70
  with gr.Row():
71
  with gr.Column(scale=1):
@@ -76,75 +111,71 @@ with gr.Blocks(title="FACT Leaderboard", theme=gr.themes.Base()) as app:
76
  value=""
77
  )
78
 
79
- # Domain filter
80
- gr.Markdown("**Filter Model: Domain**")
81
- domain_radio = gr.Radio(
82
- choices=["All", "General", "Medical"],
83
- value="All",
84
- label="",
85
- elem_classes="domain-filter"
86
- )
87
-
88
  # Size range filter
89
- gr.Markdown("**Filter Model: Size Range**")
90
  size_checkboxes = gr.CheckboxGroup(
91
- choices=["0-5", "5-10", "10-40", "40-80", ">80"],
92
- value=["0-5", "5-10", "10-40", "40-80", ">80"],
93
  label="",
94
  elem_classes="size-filter"
95
  )
96
 
97
- # Accessibility filter
98
- gr.Markdown("**Filter Model: Accessibility**")
99
- accessibility_radio = gr.Radio(
100
- choices=["All", "Open Source", "Proprietary"],
101
- value="All",
102
  label="",
103
- elem_classes="accessibility-filter"
104
  )
 
 
 
 
 
 
 
 
 
105
 
106
  with gr.Column(scale=3):
107
  # Results table
108
  results_table = gr.Dataframe(
109
- value=filter_and_search_models("", "All", ["0-5", "5-10", "10-40", "40-80", ">80"], "All"),
110
- headers=["Model", "Model: Domain", "Model: License", "Size (B)",
111
- "Average Performance", "ADE-Identification",
112
- "BrainMRI-AIS", "Brateca-Hospitalization"],
113
- datatype=["str", "str", "str", "number", "number", "number", "number", "number"],
114
  elem_id="leaderboard-table",
115
  interactive=False,
116
  wrap=True
117
  )
 
 
 
118
 
119
  # Update table when filters change
120
- def update_table(search, domain, sizes, accessibility):
121
- domain_val = None if domain == "All" else domain
122
- accessibility_val = None if accessibility == "All" else accessibility
123
- return filter_and_search_models(search, domain_val, sizes, accessibility_val)
124
 
125
  # Connect all inputs to the update function
126
  search_box.change(
127
  fn=update_table,
128
- inputs=[search_box, domain_radio, size_checkboxes, accessibility_radio],
129
- outputs=results_table
130
- )
131
-
132
- domain_radio.change(
133
- fn=update_table,
134
- inputs=[search_box, domain_radio, size_checkboxes, accessibility_radio],
135
- outputs=results_table
136
  )
137
 
138
  size_checkboxes.change(
139
  fn=update_table,
140
- inputs=[search_box, domain_radio, size_checkboxes, accessibility_radio],
141
- outputs=results_table
142
  )
143
 
144
- accessibility_radio.change(
145
  fn=update_table,
146
- inputs=[search_box, domain_radio, size_checkboxes, accessibility_radio],
147
- outputs=results_table
148
  )
149
 
150
  # Add custom CSS for better styling
@@ -153,24 +184,50 @@ with gr.Blocks(title="FACT Leaderboard", theme=gr.themes.Base()) as app:
153
  font-size: 14px;
154
  }
155
 
156
- .domain-filter label,
157
- .size-filter label,
158
- .accessibility-filter label {
 
 
 
 
 
 
159
  display: flex;
160
  align-items: center;
161
  margin: 5px 0;
162
  }
163
 
164
- .domain-filter input[type="radio"],
165
- .accessibility-filter input[type="radio"] {
166
  margin-right: 8px;
167
  }
168
 
169
- .size-filter input[type="checkbox"] {
170
- margin-right: 8px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  }
172
  """
173
 
 
 
 
 
174
  # Launch the app
175
  if __name__ == "__main__":
176
  app.launch(share=True)
 
2
  import pandas as pd
3
  import numpy as np
4
 
5
+ # Sample data based on your CSV structure
 
 
 
6
  data = {
7
+ 'Model Name': [
8
+ 'deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
9
+ 'VIDraft/Gemma-3-R1984-27B',
10
+ 'meta-llama/Llama-3.3-70B-Instruct',
11
+ 'Qwen/Qwen3-30B-A3B',
12
+ 'Qwen/Qwen3-4B',
13
+ 'Qwen/Qwen3-32B',
14
+ 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
15
+ 'Qwen/Qwen3-8B',
16
+ 'Qwen/Qwen3-14B',
17
+ 'google/gemma-3-27b-it',
18
+ 'Qwen/Qwen2.5-VL-32B-Instruct',
19
+ 'meta-llama/Llama-3.1-70B-Instruct',
20
+ 'google/gemma-3-12b-it',
21
+ 'google/gemma-3-4b-it',
22
+ 'Qwen/Qwen3-1.7B'
23
+ ],
24
+ 'Separate Grounding Score': [
25
+ 0.817797, 0.93617, 0.842553, 0.812766, 0.770213, 0.740426,
26
+ 0.766949, 0.748936, 0.778723, 0.936, 0.621277, 0.855932,
27
+ 0.944, 0.9, 0.702128
28
+ ],
29
+ 'Separate Quality Score': [
30
+ 0.542373, 0.459574, 0.510638, 0.540426, 0.540426, 0.553191,
31
+ 0.516949, 0.523404, 0.502128, 0.391, 0.570213, 0.389831,
32
+ 0.343, 0.33, 0.451064
33
+ ],
34
+ 'Combined Score': [
35
+ 0.457627, 0.434043, 0.425532, 0.425532, 0.425532, 0.417021,
36
+ 0.40678, 0.4, 0.382979, 0.378, 0.357447, 0.334746,
37
+ 0.313, 0.3, 0.297872
38
+ ]
39
  }
40
 
41
+ # Create DataFrame
42
  df = pd.DataFrame(data)
43
 
44
+ # Extract size from model name for filtering
45
def extract_size(model_name):
    """Return the parameter count, in billions, encoded in a model name.

    The first number that is immediately followed by a ``B`` or ``b`` size
    suffix is used, so ``'Qwen/Qwen3-30B-A3B'`` gives ``30.0`` and
    ``'google/gemma-3-27b-it'`` gives ``27.0``.

    Args:
        model_name: Model identifier such as ``'Qwen/Qwen3-1.7B'``.

    Returns:
        The size as a float, or ``0.0`` when ``model_name`` is not a string
        or contains no recognizable size suffix.
    """
    import re

    if not isinstance(model_name, str):
        # A missing name (e.g. NaN in a loaded CSV) has no size to extract.
        return 0.0

    # Accept both "14B" and "27b". The negative lookahead keeps a suffix
    # such as "4bit" from being read as a 4-billion-parameter size.
    match = re.search(r'(\d+\.?\d*)[bB](?![A-Za-z])', model_name)
    return float(match.group(1)) if match else 0.0
53
+
54
+ df['Size'] = df['Model Name'].apply(extract_size)
55
+
56
+ # Add size category for filtering
57
def get_size_category(size):
    """Map a model size (in billions of parameters) to a size-filter label.

    Upper bounds are inclusive: a size of exactly 5 is "0-5B" and exactly 10
    is "5-10B". Any size of 5 or less, including 0, maps to "0-5B"; anything
    that is not less than or equal to 80 maps to ">80B".

    Args:
        size: Numeric model size.

    Returns:
        One of "0-5B", "5-10B", "10-20B", "20-40B", "40-80B" or ">80B".
    """
    # (inclusive upper bound, label) pairs in ascending order.
    buckets = (
        (5, "0-5B"),
        (10, "5-10B"),
        (20, "10-20B"),
        (40, "20-40B"),
        (80, "40-80B"),
    )
    for upper_bound, label in buckets:
        if size <= upper_bound:
            return label
    return ">80B"
70
+
71
+ df['Size_Category'] = df['Size'].apply(get_size_category)
72
+
73
def filter_and_search_models(search_query, size_ranges, sort_by, source=None):
    """Filter, sort and format the leaderboard for display.

    Args:
        search_query: Text to look for in the 'Model Name' column. It is
            matched case-insensitively as a literal substring, so characters
            such as "(" or "+" are safe to type. An empty value disables
            the search filter.
        size_ranges: Collection of 'Size_Category' labels to keep. An empty
            or None value disables the size filter.
        sort_by: Column to sort by, in descending order. Ignored when it is
            not a column of the data.
        source: Optional DataFrame to filter instead of the module-level
            ``df``. It must provide the 'Model Name', 'Size_Category' and
            the three score columns.

    Returns:
        A new DataFrame holding only 'Model Name' and the three score
        columns, with scores rounded to 6 decimals. The input frame is not
        modified.
    """
    filtered_df = df if source is None else source

    # Apply search filter. regex=False treats the user's text literally;
    # as a regex, an unbalanced "(" or "[" would raise inside the callback.
    if search_query:
        mask = filtered_df['Model Name'].str.contains(
            search_query, case=False, na=False, regex=False
        )
        filtered_df = filtered_df[mask]

    # Apply size range filter
    if size_ranges:
        filtered_df = filtered_df[filtered_df['Size_Category'].isin(size_ranges)]

    # Sort by selected metric
    if sort_by in filtered_df.columns:
        filtered_df = filtered_df.sort_values(sort_by, ascending=False)

    score_columns = [
        'Separate Grounding Score',
        'Separate Quality Score',
        'Combined Score',
    ]

    # Selecting columns and rounding both return new frames, so nothing is
    # assigned into a slice of the original data.
    display_df = filtered_df[['Model Name'] + score_columns]
    return display_df.round(dict.fromkeys(score_columns, 6))
99
 
100
  # Create the Gradio interface
101
  with gr.Blocks(title="FACT Leaderboard", theme=gr.themes.Base()) as app:
102
  gr.Markdown("# 🏆 FACT Leaderboard")
103
+ gr.Markdown("### Benchmark for evaluating factuality in language models")
104
 
105
  with gr.Row():
106
  with gr.Column(scale=1):
 
111
  value=""
112
  )
113
 
 
 
 
 
 
 
 
 
 
114
  # Size range filter
115
+ gr.Markdown("**Filter by Model Size**")
116
  size_checkboxes = gr.CheckboxGroup(
117
+ choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
118
+ value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
119
  label="",
120
  elem_classes="size-filter"
121
  )
122
 
123
+ # Sort by dropdown
124
+ gr.Markdown("**Sort by Metric**")
125
+ sort_dropdown = gr.Dropdown(
126
+ choices=["Combined Score", "Separate Grounding Score", "Separate Quality Score"],
127
+ value="Combined Score",
128
  label="",
129
+ elem_classes="sort-dropdown"
130
  )
131
+
132
+ # Add legend/explanation
133
+ gr.Markdown("---")
134
+ gr.Markdown("**Metric Explanations:**")
135
+ gr.Markdown("""
136
+ - **Grounding Score**: Measures factual accuracy
137
+ - **Quality Score**: Measures response quality
138
+ - **Combined Score**: Overall performance metric
139
+ """)
140
 
141
  with gr.Column(scale=3):
142
  # Results table
143
  results_table = gr.Dataframe(
144
+ value=filter_and_search_models("", ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"], "Combined Score"),
145
+ headers=["Model Name", "Separate Grounding Score",
146
+ "Separate Quality Score", "Combined Score"],
147
+ datatype=["str", "number", "number", "number"],
 
148
  elem_id="leaderboard-table",
149
  interactive=False,
150
  wrap=True
151
  )
152
+
153
+ # Add statistics
154
+ total_models = gr.Markdown(f"**Total Models: {len(df)}**")
155
 
156
  # Update table when filters change
157
def update_table(search, sizes, sort_by):
    """Recompute the leaderboard view and its model-count label.

    Args:
        search: Current text of the search box.
        sizes: Currently selected size-category labels.
        sort_by: Currently selected sort metric.

    Returns:
        A ``(table, label)`` tuple: the result of
        ``filter_and_search_models`` and a Markdown string reporting how
        many rows that result contains.
    """
    table = filter_and_search_models(search, sizes, sort_by)
    return table, f"**Total Models: {len(table)}**"
161
 
162
  # Connect all inputs to the update function
163
  search_box.change(
164
  fn=update_table,
165
+ inputs=[search_box, size_checkboxes, sort_dropdown],
166
+ outputs=[results_table, total_models]
 
 
 
 
 
 
167
  )
168
 
169
  size_checkboxes.change(
170
  fn=update_table,
171
+ inputs=[search_box, size_checkboxes, sort_dropdown],
172
+ outputs=[results_table, total_models]
173
  )
174
 
175
+ sort_dropdown.change(
176
  fn=update_table,
177
+ inputs=[search_box, size_checkboxes, sort_dropdown],
178
+ outputs=[results_table, total_models]
179
  )
180
 
181
  # Add custom CSS for better styling
 
184
  font-size: 14px;
185
  }
186
 
187
+ #leaderboard-table td:first-child {
188
+ font-weight: 500;
189
+ }
190
+
191
+ #leaderboard-table td:not(:first-child) {
192
+ text-align: center;
193
+ }
194
+
195
+ .size-filter label {
196
  display: flex;
197
  align-items: center;
198
  margin: 5px 0;
199
  }
200
 
201
+ .size-filter input[type="checkbox"] {
 
202
  margin-right: 8px;
203
  }
204
 
205
+ .sort-dropdown {
206
+ margin-top: 10px;
207
+ }
208
+
209
+ /* Highlight rows based on model family */
210
+ #leaderboard-table tr:has(td:contains("meta-llama")) {
211
+ background-color: #fffbf0;
212
+ }
213
+
214
+ #leaderboard-table tr:has(td:contains("deepseek")) {
215
+ background-color: #f0f8ff;
216
+ }
217
+
218
+ #leaderboard-table tr:has(td:contains("Qwen")) {
219
+ background-color: #f0fff0;
220
+ }
221
+
222
+ #leaderboard-table tr:has(td:contains("google")) {
223
+ background-color: #fff0f5;
224
  }
225
  """
226
 
227
+ # To load from CSV file, replace the sample data with:
228
+ # df = pd.read_csv('your_fact_leaderboard.csv')
229
+ # Then add the Size extraction and Size_Category as shown above
230
+
231
  # Launch the app
232
  if __name__ == "__main__":
233
  app.launch(share=True)