Bagratuni committed on
Commit
946a71d
·
1 Parent(s): deb84cd

add new models evaluation on unified exams

Browse files
Files changed (1) hide show
  1. model_results.json +805 -626
model_results.json CHANGED
@@ -1,645 +1,824 @@
1
  [
2
- {
3
- "model_name": "claude-3-7-sonnet-20250219",
4
- "results": {
5
- "mmlu_results": [],
6
- "unified_exam_results": [
7
- {
8
- "category": "Average",
9
- "score": 11.0833
10
- },
11
- {
12
- "category": "Armenian language and literature",
13
- "score": 10.5
14
- },
15
- {
16
- "category": "Armenian history",
17
- "score": 7.75
18
- },
19
- {
20
- "category": "Mathematics",
21
- "score": 15.0
22
- }
23
- ]
24
  }
 
25
  },
26
- {
27
- "model_name": "claude-3-5-sonnet-20241022",
28
- "results": {
29
- "mmlu_results": [
30
- {
31
- "category": "Average",
32
- "score": 0.6958
33
- },
34
- {
35
- "category": "Biology",
36
- "score": 0.8667
37
- },
38
- {
39
- "category": "Business",
40
- "score": 0.803
41
- },
42
- {
43
- "category": "Chemistry",
44
- "score": 0.7579
45
- },
46
- {
47
- "category": "Computer Science",
48
- "score": 0.7059
49
- },
50
- {
51
- "category": "Economics",
52
- "score": 0.7887
53
- },
54
- {
55
- "category": "Engineering",
56
- "score": 0.5625
57
- },
58
- {
59
- "category": "Health",
60
- "score": 0.6618
61
- },
62
- {
63
- "category": "History",
64
- "score": 0.6552
65
- },
66
- {
67
- "category": "Law",
68
- "score": 0.4944
69
- },
70
- {
71
- "category": "Math",
72
- "score": 0.7788
73
- },
74
- {
75
- "category": "Other",
76
- "score": 0.6494
77
- },
78
- {
79
- "category": "Philosophy",
80
- "score": 0.5476
81
- },
82
- {
83
- "category": "Physics",
84
- "score": 0.7523
85
- },
86
- {
87
- "category": "Psychology",
88
- "score": 0.7164
89
- }
90
- ],
91
- "unified_exam_results": [
92
- {
93
- "category": "Average",
94
- "score": 10.6667
95
- },
96
- {
97
- "category": "Armenian language and literature",
98
- "score": 10.0
99
- },
100
- {
101
- "category": "Armenian history",
102
- "score": 9.25
103
- },
104
- {
105
- "category": "Mathematics",
106
- "score": 12.75
107
- }
108
- ]
109
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  },
111
- {
112
- "model_name": "gemini-2.0-flash",
113
- "results": {
114
- "mmlu_results": [
115
- {
116
- "category": "Average",
117
- "score": 0.7247
118
- },
119
- {
120
- "category": "Biology",
121
- "score": 0.85
122
- },
123
- {
124
- "category": "Business",
125
- "score": 0.8182
126
- },
127
- {
128
- "category": "Chemistry",
129
- "score": 0.7895
130
- },
131
- {
132
- "category": "Computer Science",
133
- "score": 0.7353
134
- },
135
- {
136
- "category": "Economics",
137
- "score": 0.8169
138
- },
139
- {
140
- "category": "Engineering",
141
- "score": 0.6
142
- },
143
- {
144
- "category": "Health",
145
- "score": 0.75
146
- },
147
- {
148
- "category": "History",
149
- "score": 0.5517
150
- },
151
- {
152
- "category": "Law",
153
- "score": 0.5281
154
- },
155
- {
156
- "category": "Math",
157
- "score": 0.8673
158
- },
159
- {
160
- "category": "Other",
161
- "score": 0.6364
162
- },
163
- {
164
- "category": "Philosophy",
165
- "score": 0.6429
166
- },
167
- {
168
- "category": "Physics",
169
- "score": 0.7982
170
- },
171
- {
172
- "category": "Psychology",
173
- "score": 0.7612
174
- }
175
- ],
176
- "unified_exam_results": [
177
- {
178
- "category": "Average",
179
- "score": 9.8333
180
- },
181
- {
182
- "category": "Armenian language and literature",
183
- "score": 5.5
184
- },
185
- {
186
- "category": "Armenian history",
187
- "score": 6.75
188
- },
189
- {
190
- "category": "Mathematics",
191
- "score": 17.25
192
- }
193
- ]
 
194
  }
 
195
  },
196
- {
197
- "model_name": "gpt-4o",
198
- "results": {
199
- "mmlu_results": [
200
- {
201
- "category": "Average",
202
- "score": 0.6758
203
- },
204
- {
205
- "category": "Biology",
206
- "score": 0.8667
207
- },
208
- {
209
- "category": "Business",
210
- "score": 0.7424
211
- },
212
- {
213
- "category": "Chemistry",
214
- "score": 0.6842
215
- },
216
- {
217
- "category": "Computer Science",
218
- "score": 0.6176
219
- },
220
- {
221
- "category": "Economics",
222
- "score": 0.7887
223
- },
224
- {
225
- "category": "Engineering",
226
- "score": 0.5625
227
- },
228
- {
229
- "category": "Health",
230
- "score": 0.7794
231
- },
232
- {
233
- "category": "History",
234
- "score": 0.5517
235
- },
236
- {
237
- "category": "Law",
238
- "score": 0.5393
239
- },
240
- {
241
- "category": "Math",
242
- "score": 0.7788
243
- },
244
- {
245
- "category": "Other",
246
- "score": 0.5974
247
- },
248
- {
249
- "category": "Philosophy",
250
- "score": 0.5476
251
- },
252
- {
253
- "category": "Physics",
254
- "score": 0.6881
255
- },
256
- {
257
- "category": "Psychology",
258
- "score": 0.7164
259
- }
260
- ],
261
- "unified_exam_results": [
262
- {
263
- "category": "Average",
264
- "score": 8.9167
265
- },
266
- {
267
- "category": "Armenian language and literature",
268
- "score": 6.75
269
- },
270
- {
271
- "category": "Armenian history",
272
- "score": 6.75
273
- },
274
- {
275
- "category": "Mathematics",
276
- "score": 13.25
277
- }
278
- ]
279
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  },
281
- {
282
- "model_name": "qwen-max-2025-01-25",
283
- "results": {
284
- "mmlu_results": [],
285
- "unified_exam_results": [
286
- {
287
- "category": "Average",
288
- "score": 8.6667
289
- },
290
- {
291
- "category": "Armenian language and literature",
292
- "score": 7.25
293
- },
294
- {
295
- "category": "Armenian history",
296
- "score": 4.5
297
- },
298
- {
299
- "category": "Mathematics",
300
- "score": 14.25
301
- }
302
- ]
 
303
  }
 
304
  },
305
- {
306
- "model_name": "gemini-1.5-flash",
307
- "results": {
308
- "mmlu_results": [
309
- {
310
- "category": "Average",
311
- "score": 0.5592
312
- },
313
- {
314
- "category": "Biology",
315
- "score": 0.75
316
- },
317
- {
318
- "category": "Business",
319
- "score": 0.7121
320
- },
321
- {
322
- "category": "Chemistry",
323
- "score": 0.6947
324
- },
325
- {
326
- "category": "Computer Science",
327
- "score": 0.5
328
- },
329
- {
330
- "category": "Economics",
331
- "score": 0.7183
332
- },
333
- {
334
- "category": "Engineering",
335
- "score": 0.4
336
- },
337
- {
338
- "category": "Health",
339
- "score": 0.5
340
- },
341
- {
342
- "category": "History",
343
- "score": 0.4483
344
- },
345
- {
346
- "category": "Law",
347
- "score": 0.2584
348
- },
349
- {
350
- "category": "Math",
351
- "score": 0.8319
352
- },
353
- {
354
- "category": "Other",
355
- "score": 0.3506
356
- },
357
- {
358
- "category": "Philosophy",
359
- "score": 0.3571
360
- },
361
- {
362
- "category": "Physics",
363
- "score": 0.6514
364
- },
365
- {
366
- "category": "Psychology",
367
- "score": 0.6567
368
- }
369
- ],
370
- "unified_exam_results": [
371
- {
372
- "category": "Average",
373
- "score": 7.8333
374
- },
375
- {
376
- "category": "Armenian language and literature",
377
- "score": 4.75
378
- },
379
- {
380
- "category": "Armenian history",
381
- "score": 3.75
382
- },
383
- {
384
- "category": "Mathematics",
385
- "score": 15.0
386
- }
387
- ]
 
388
  }
 
389
  },
390
- {
391
- "model_name": "DeepSeek-V3",
392
- "results": {
393
- "mmlu_results": [
394
- {
395
- "category": "Average",
396
- "score": 0.6633
397
- },
398
- {
399
- "category": "Biology",
400
- "score": 0.8167
401
- },
402
- {
403
- "category": "Business",
404
- "score": 0.8182
405
- },
406
- {
407
- "category": "Chemistry",
408
- "score": 0.6947
409
- },
410
- {
411
- "category": "Computer Science",
412
- "score": 0.7353
413
- },
414
- {
415
- "category": "Economics",
416
- "score": 0.7887
417
- },
418
- {
419
- "category": "Engineering",
420
- "score": 0.5875
421
- },
422
- {
423
- "category": "Health",
424
- "score": 0.6471
425
- },
426
- {
427
- "category": "History",
428
- "score": 0.4828
429
- },
430
- {
431
- "category": "Law",
432
- "score": 0.3596
433
- },
434
- {
435
- "category": "Math",
436
- "score": 0.8584
437
- },
438
- {
439
- "category": "Other",
440
- "score": 0.5455
441
- },
442
- {
443
- "category": "Philosophy",
444
- "score": 0.5476
445
- },
446
- {
447
- "category": "Physics",
448
- "score": 0.6881
449
- },
450
- {
451
- "category": "Psychology",
452
- "score": 0.7164
453
- }
454
- ],
455
- "unified_exam_results": [
456
- {
457
- "category": "Average",
458
- "score": 7.5
459
- },
460
- {
461
- "category": "Armenian language and literature",
462
- "score": 5.25
463
- },
464
- {
465
- "category": "Armenian history",
466
- "score": 5.0
467
- },
468
- {
469
- "category": "Mathematics",
470
- "score": 12.25
471
- }
472
- ]
473
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  },
475
- {
476
- "model_name": "Meta-Llama-3.3-70B-Instruct",
477
- "results": {
478
- "mmlu_results": [
479
- {
480
- "category": "Average",
481
- "score": 0.5139
482
- },
483
- {
484
- "category": "Biology",
485
- "score": 0.7333
486
- },
487
- {
488
- "category": "Business",
489
- "score": 0.5303
490
- },
491
- {
492
- "category": "Chemistry",
493
- "score": 0.5895
494
- },
495
- {
496
- "category": "Computer Science",
497
- "score": 0.3824
498
- },
499
- {
500
- "category": "Economics",
501
- "score": 0.6338
502
- },
503
- {
504
- "category": "Engineering",
505
- "score": 0.4875
506
- },
507
- {
508
- "category": "Health",
509
- "score": 0.5735
510
- },
511
- {
512
- "category": "History",
513
- "score": 0.4138
514
- },
515
- {
516
- "category": "Law",
517
- "score": 0.3146
518
- },
519
- {
520
- "category": "Math",
521
- "score": 0.6018
522
- },
523
- {
524
- "category": "Other",
525
- "score": 0.3377
526
- },
527
- {
528
- "category": "Philosophy",
529
- "score": 0.4524
530
- },
531
- {
532
- "category": "Physics",
533
- "score": 0.5321
534
- },
535
- {
536
- "category": "Psychology",
537
- "score": 0.6119
538
- }
539
- ],
540
- "unified_exam_results": [
541
- {
542
- "category": "Average",
543
- "score": 7.0833
544
- },
545
- {
546
- "category": "Armenian language and literature",
547
- "score": 4.5
548
- },
549
- {
550
- "category": "Armenian history",
551
- "score": 5.25
552
- },
553
- {
554
- "category": "Mathematics",
555
- "score": 11.5
556
- }
557
- ]
 
558
  }
 
559
  },
560
- {
561
- "model_name": "claude-3-5-haiku-20241022",
562
- "results": {
563
- "mmlu_results": [
564
- {
565
- "category": "Average",
566
- "score": 0.5198
567
- },
568
- {
569
- "category": "Biology",
570
- "score": 0.75
571
- },
572
- {
573
- "category": "Business",
574
- "score": 0.5758
575
- },
576
- {
577
- "category": "Chemistry",
578
- "score": 0.5579
579
- },
580
- {
581
- "category": "Computer Science",
582
- "score": 0.4412
583
- },
584
- {
585
- "category": "Economics",
586
- "score": 0.6901
587
- },
588
- {
589
- "category": "Engineering",
590
- "score": 0.4125
591
- },
592
- {
593
- "category": "Health",
594
- "score": 0.5882
595
- },
596
- {
597
- "category": "History",
598
- "score": 0.5172
599
- },
600
- {
601
- "category": "Law",
602
- "score": 0.2472
603
- },
604
- {
605
- "category": "Math",
606
- "score": 0.6018
607
- },
608
- {
609
- "category": "Other",
610
- "score": 0.3636
611
- },
612
- {
613
- "category": "Philosophy",
614
- "score": 0.4048
615
- },
616
- {
617
- "category": "Physics",
618
- "score": 0.5596
619
- },
620
- {
621
- "category": "Psychology",
622
- "score": 0.5672
623
- }
624
- ],
625
- "unified_exam_results": [
626
- {
627
- "category": "Average",
628
- "score": 6.5
629
- },
630
- {
631
- "category": "Armenian language and literature",
632
- "score": 5.0
633
- },
634
- {
635
- "category": "Armenian history",
636
- "score": 3.75
637
- },
638
- {
639
- "category": "Mathematics",
640
- "score": 10.75
641
- }
642
- ]
643
  }
644
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  ]
 
1
  [
2
+ {
3
+ "model_name": "claude-3-7-sonnet-20250219",
4
+ "results": {
5
+ "mmlu_results": [],
6
+ "unified_exam_results": [
7
+ {
8
+ "category": "Average",
9
+ "score": 11.0833
10
+ },
11
+ {
12
+ "category": "Armenian language and literature",
13
+ "score": 10.5
14
+ },
15
+ {
16
+ "category": "Armenian history",
17
+ "score": 7.75
18
+ },
19
+ {
20
+ "category": "Mathematics",
21
+ "score": 15.0
 
 
22
  }
23
+ ]
24
  },
25
+ "mmlu_results": null,
26
+ "unified_exam_results": null
27
+ },
28
+ {
29
+ "model_name": "claude-3-5-sonnet-20241022",
30
+ "results": {
31
+ "mmlu_results": [
32
+ {
33
+ "category": "Average",
34
+ "score": 0.6958
35
+ },
36
+ {
37
+ "category": "Biology",
38
+ "score": 0.8667
39
+ },
40
+ {
41
+ "category": "Business",
42
+ "score": 0.803
43
+ },
44
+ {
45
+ "category": "Chemistry",
46
+ "score": 0.7579
47
+ },
48
+ {
49
+ "category": "Computer Science",
50
+ "score": 0.7059
51
+ },
52
+ {
53
+ "category": "Economics",
54
+ "score": 0.7887
55
+ },
56
+ {
57
+ "category": "Engineering",
58
+ "score": 0.5625
59
+ },
60
+ {
61
+ "category": "Health",
62
+ "score": 0.6618
63
+ },
64
+ {
65
+ "category": "History",
66
+ "score": 0.6552
67
+ },
68
+ {
69
+ "category": "Law",
70
+ "score": 0.4944
71
+ },
72
+ {
73
+ "category": "Math",
74
+ "score": 0.7788
75
+ },
76
+ {
77
+ "category": "Other",
78
+ "score": 0.6494
79
+ },
80
+ {
81
+ "category": "Philosophy",
82
+ "score": 0.5476
83
+ },
84
+ {
85
+ "category": "Physics",
86
+ "score": 0.7523
87
+ },
88
+ {
89
+ "category": "Psychology",
90
+ "score": 0.7164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  }
92
+ ],
93
+ "unified_exam_results": [
94
+ {
95
+ "category": "Average",
96
+ "score": 10.6667
97
+ },
98
+ {
99
+ "category": "Armenian language and literature",
100
+ "score": 10.0
101
+ },
102
+ {
103
+ "category": "Armenian history",
104
+ "score": 9.25
105
+ },
106
+ {
107
+ "category": "Mathematics",
108
+ "score": 12.75
109
+ }
110
+ ]
111
  },
112
+ "mmlu_results": null,
113
+ "unified_exam_results": null
114
+ },
115
+ {
116
+ "model_name": "gemini-2.0-flash",
117
+ "results": {
118
+ "mmlu_results": [
119
+ {
120
+ "category": "Average",
121
+ "score": 0.7247
122
+ },
123
+ {
124
+ "category": "Biology",
125
+ "score": 0.85
126
+ },
127
+ {
128
+ "category": "Business",
129
+ "score": 0.8182
130
+ },
131
+ {
132
+ "category": "Chemistry",
133
+ "score": 0.7895
134
+ },
135
+ {
136
+ "category": "Computer Science",
137
+ "score": 0.7353
138
+ },
139
+ {
140
+ "category": "Economics",
141
+ "score": 0.8169
142
+ },
143
+ {
144
+ "category": "Engineering",
145
+ "score": 0.6
146
+ },
147
+ {
148
+ "category": "Health",
149
+ "score": 0.75
150
+ },
151
+ {
152
+ "category": "History",
153
+ "score": 0.5517
154
+ },
155
+ {
156
+ "category": "Law",
157
+ "score": 0.5281
158
+ },
159
+ {
160
+ "category": "Math",
161
+ "score": 0.8673
162
+ },
163
+ {
164
+ "category": "Other",
165
+ "score": 0.6364
166
+ },
167
+ {
168
+ "category": "Philosophy",
169
+ "score": 0.6429
170
+ },
171
+ {
172
+ "category": "Physics",
173
+ "score": 0.7982
174
+ },
175
+ {
176
+ "category": "Psychology",
177
+ "score": 0.7612
178
+ }
179
+ ],
180
+ "unified_exam_results": [
181
+ {
182
+ "category": "Average",
183
+ "score": 9.8333
184
+ },
185
+ {
186
+ "category": "Armenian language and literature",
187
+ "score": 5.5
188
+ },
189
+ {
190
+ "category": "Armenian history",
191
+ "score": 6.75
192
+ },
193
+ {
194
+ "category": "Mathematics",
195
+ "score": 17.25
196
  }
197
+ ]
198
  },
199
+ "mmlu_results": null,
200
+ "unified_exam_results": null
201
+ },
202
+ {
203
+ "model_name": "gpt-4o",
204
+ "results": {
205
+ "mmlu_results": [
206
+ {
207
+ "category": "Average",
208
+ "score": 0.6758
209
+ },
210
+ {
211
+ "category": "Biology",
212
+ "score": 0.8667
213
+ },
214
+ {
215
+ "category": "Business",
216
+ "score": 0.7424
217
+ },
218
+ {
219
+ "category": "Chemistry",
220
+ "score": 0.6842
221
+ },
222
+ {
223
+ "category": "Computer Science",
224
+ "score": 0.6176
225
+ },
226
+ {
227
+ "category": "Economics",
228
+ "score": 0.7887
229
+ },
230
+ {
231
+ "category": "Engineering",
232
+ "score": 0.5625
233
+ },
234
+ {
235
+ "category": "Health",
236
+ "score": 0.7794
237
+ },
238
+ {
239
+ "category": "History",
240
+ "score": 0.5517
241
+ },
242
+ {
243
+ "category": "Law",
244
+ "score": 0.5393
245
+ },
246
+ {
247
+ "category": "Math",
248
+ "score": 0.7788
249
+ },
250
+ {
251
+ "category": "Other",
252
+ "score": 0.5974
253
+ },
254
+ {
255
+ "category": "Philosophy",
256
+ "score": 0.5476
257
+ },
258
+ {
259
+ "category": "Physics",
260
+ "score": 0.6881
261
+ },
262
+ {
263
+ "category": "Psychology",
264
+ "score": 0.7164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  }
266
+ ],
267
+ "unified_exam_results": [
268
+ {
269
+ "category": "Average",
270
+ "score": 8.9167
271
+ },
272
+ {
273
+ "category": "Armenian language and literature",
274
+ "score": 6.75
275
+ },
276
+ {
277
+ "category": "Armenian history",
278
+ "score": 6.75
279
+ },
280
+ {
281
+ "category": "Mathematics",
282
+ "score": 13.25
283
+ }
284
+ ]
285
  },
286
+ "mmlu_results": null,
287
+ "unified_exam_results": null
288
+ },
289
+ {
290
+ "model_name": "qwen-max-2025-01-25",
291
+ "results": {
292
+ "mmlu_results": [],
293
+ "unified_exam_results": [
294
+ {
295
+ "category": "Average",
296
+ "score": 8.6667
297
+ },
298
+ {
299
+ "category": "Armenian language and literature",
300
+ "score": 7.25
301
+ },
302
+ {
303
+ "category": "Armenian history",
304
+ "score": 4.5
305
+ },
306
+ {
307
+ "category": "Mathematics",
308
+ "score": 14.25
309
  }
310
+ ]
311
  },
312
+ "mmlu_results": null,
313
+ "unified_exam_results": null
314
+ },
315
+ {
316
+ "model_name": "gemini-1.5-flash",
317
+ "results": {
318
+ "mmlu_results": [
319
+ {
320
+ "category": "Average",
321
+ "score": 0.5592
322
+ },
323
+ {
324
+ "category": "Biology",
325
+ "score": 0.75
326
+ },
327
+ {
328
+ "category": "Business",
329
+ "score": 0.7121
330
+ },
331
+ {
332
+ "category": "Chemistry",
333
+ "score": 0.6947
334
+ },
335
+ {
336
+ "category": "Computer Science",
337
+ "score": 0.5
338
+ },
339
+ {
340
+ "category": "Economics",
341
+ "score": 0.7183
342
+ },
343
+ {
344
+ "category": "Engineering",
345
+ "score": 0.4
346
+ },
347
+ {
348
+ "category": "Health",
349
+ "score": 0.5
350
+ },
351
+ {
352
+ "category": "History",
353
+ "score": 0.4483
354
+ },
355
+ {
356
+ "category": "Law",
357
+ "score": 0.2584
358
+ },
359
+ {
360
+ "category": "Math",
361
+ "score": 0.8319
362
+ },
363
+ {
364
+ "category": "Other",
365
+ "score": 0.3506
366
+ },
367
+ {
368
+ "category": "Philosophy",
369
+ "score": 0.3571
370
+ },
371
+ {
372
+ "category": "Physics",
373
+ "score": 0.6514
374
+ },
375
+ {
376
+ "category": "Psychology",
377
+ "score": 0.6567
378
+ }
379
+ ],
380
+ "unified_exam_results": [
381
+ {
382
+ "category": "Average",
383
+ "score": 7.8333
384
+ },
385
+ {
386
+ "category": "Armenian language and literature",
387
+ "score": 4.75
388
+ },
389
+ {
390
+ "category": "Armenian history",
391
+ "score": 3.75
392
+ },
393
+ {
394
+ "category": "Mathematics",
395
+ "score": 15.0
396
  }
397
+ ]
398
  },
399
+ "mmlu_results": null,
400
+ "unified_exam_results": null
401
+ },
402
+ {
403
+ "model_name": "DeepSeek-V3",
404
+ "results": {
405
+ "mmlu_results": [
406
+ {
407
+ "category": "Average",
408
+ "score": 0.6633
409
+ },
410
+ {
411
+ "category": "Biology",
412
+ "score": 0.8167
413
+ },
414
+ {
415
+ "category": "Business",
416
+ "score": 0.8182
417
+ },
418
+ {
419
+ "category": "Chemistry",
420
+ "score": 0.6947
421
+ },
422
+ {
423
+ "category": "Computer Science",
424
+ "score": 0.7353
425
+ },
426
+ {
427
+ "category": "Economics",
428
+ "score": 0.7887
429
+ },
430
+ {
431
+ "category": "Engineering",
432
+ "score": 0.5875
433
+ },
434
+ {
435
+ "category": "Health",
436
+ "score": 0.6471
437
+ },
438
+ {
439
+ "category": "History",
440
+ "score": 0.4828
441
+ },
442
+ {
443
+ "category": "Law",
444
+ "score": 0.3596
445
+ },
446
+ {
447
+ "category": "Math",
448
+ "score": 0.8584
449
+ },
450
+ {
451
+ "category": "Other",
452
+ "score": 0.5455
453
+ },
454
+ {
455
+ "category": "Philosophy",
456
+ "score": 0.5476
457
+ },
458
+ {
459
+ "category": "Physics",
460
+ "score": 0.6881
461
+ },
462
+ {
463
+ "category": "Psychology",
464
+ "score": 0.7164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  }
466
+ ],
467
+ "unified_exam_results": [
468
+ {
469
+ "category": "Average",
470
+ "score": 7.5
471
+ },
472
+ {
473
+ "category": "Armenian language and literature",
474
+ "score": 5.25
475
+ },
476
+ {
477
+ "category": "Armenian history",
478
+ "score": 5.0
479
+ },
480
+ {
481
+ "category": "Mathematics",
482
+ "score": 12.25
483
+ }
484
+ ]
485
  },
486
+ "mmlu_results": null,
487
+ "unified_exam_results": null
488
+ },
489
+ {
490
+ "model_name": "Meta-Llama-3.3-70B-Instruct",
491
+ "results": {
492
+ "mmlu_results": [
493
+ {
494
+ "category": "Average",
495
+ "score": 0.5139
496
+ },
497
+ {
498
+ "category": "Biology",
499
+ "score": 0.7333
500
+ },
501
+ {
502
+ "category": "Business",
503
+ "score": 0.5303
504
+ },
505
+ {
506
+ "category": "Chemistry",
507
+ "score": 0.5895
508
+ },
509
+ {
510
+ "category": "Computer Science",
511
+ "score": 0.3824
512
+ },
513
+ {
514
+ "category": "Economics",
515
+ "score": 0.6338
516
+ },
517
+ {
518
+ "category": "Engineering",
519
+ "score": 0.4875
520
+ },
521
+ {
522
+ "category": "Health",
523
+ "score": 0.5735
524
+ },
525
+ {
526
+ "category": "History",
527
+ "score": 0.4138
528
+ },
529
+ {
530
+ "category": "Law",
531
+ "score": 0.3146
532
+ },
533
+ {
534
+ "category": "Math",
535
+ "score": 0.6018
536
+ },
537
+ {
538
+ "category": "Other",
539
+ "score": 0.3377
540
+ },
541
+ {
542
+ "category": "Philosophy",
543
+ "score": 0.4524
544
+ },
545
+ {
546
+ "category": "Physics",
547
+ "score": 0.5321
548
+ },
549
+ {
550
+ "category": "Psychology",
551
+ "score": 0.6119
552
+ }
553
+ ],
554
+ "unified_exam_results": [
555
+ {
556
+ "category": "Average",
557
+ "score": 7.0833
558
+ },
559
+ {
560
+ "category": "Armenian language and literature",
561
+ "score": 4.5
562
+ },
563
+ {
564
+ "category": "Armenian history",
565
+ "score": 5.25
566
+ },
567
+ {
568
+ "category": "Mathematics",
569
+ "score": 11.5
570
  }
571
+ ]
572
  },
573
+ "mmlu_results": null,
574
+ "unified_exam_results": null
575
+ },
576
+ {
577
+ "model_name": "claude-3-5-haiku-20241022",
578
+ "results": {
579
+ "mmlu_results": [
580
+ {
581
+ "category": "Average",
582
+ "score": 0.5198
583
+ },
584
+ {
585
+ "category": "Biology",
586
+ "score": 0.75
587
+ },
588
+ {
589
+ "category": "Business",
590
+ "score": 0.5758
591
+ },
592
+ {
593
+ "category": "Chemistry",
594
+ "score": 0.5579
595
+ },
596
+ {
597
+ "category": "Computer Science",
598
+ "score": 0.4412
599
+ },
600
+ {
601
+ "category": "Economics",
602
+ "score": 0.6901
603
+ },
604
+ {
605
+ "category": "Engineering",
606
+ "score": 0.4125
607
+ },
608
+ {
609
+ "category": "Health",
610
+ "score": 0.5882
611
+ },
612
+ {
613
+ "category": "History",
614
+ "score": 0.5172
615
+ },
616
+ {
617
+ "category": "Law",
618
+ "score": 0.2472
619
+ },
620
+ {
621
+ "category": "Math",
622
+ "score": 0.6018
623
+ },
624
+ {
625
+ "category": "Other",
626
+ "score": 0.3636
627
+ },
628
+ {
629
+ "category": "Philosophy",
630
+ "score": 0.4048
631
+ },
632
+ {
633
+ "category": "Physics",
634
+ "score": 0.5596
635
+ },
636
+ {
637
+ "category": "Psychology",
638
+ "score": 0.5672
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  }
640
+ ],
641
+ "unified_exam_results": [
642
+ {
643
+ "category": "Average",
644
+ "score": 6.5
645
+ },
646
+ {
647
+ "category": "Armenian language and literature",
648
+ "score": 5.0
649
+ },
650
+ {
651
+ "category": "Armenian history",
652
+ "score": 3.75
653
+ },
654
+ {
655
+ "category": "Mathematics",
656
+ "score": 10.75
657
+ }
658
+ ]
659
+ },
660
+ "mmlu_results": null,
661
+ "unified_exam_results": null
662
+ },
663
+ {
664
+ "model_name": "Gen2B/HyGPT-10b-it",
665
+ "results": null,
666
+ "mmlu_results": [],
667
+ "unified_exam_results": [
668
+ {
669
+ "category": "Armenian language and literature",
670
+ "score": 4.5
671
+ },
672
+ {
673
+ "category": "Armenian history",
674
+ "score": 4.25
675
+ },
676
+ {
677
+ "category": "Mathematics",
678
+ "score": 3.0
679
+ },
680
+ {
681
+ "category": "Average",
682
+ "score": 3.9167
683
+ }
684
+ ]
685
+ },
686
+ {
687
+ "model_name": "google/gemma-2-9b-it",
688
+ "results": null,
689
+ "mmlu_results": [],
690
+ "unified_exam_results": [
691
+ {
692
+ "category": "Armenian language and literature",
693
+ "score": 3.25
694
+ },
695
+ {
696
+ "category": "Armenian history",
697
+ "score": 1.75
698
+ },
699
+ {
700
+ "category": "Mathematics",
701
+ "score": 2.0
702
+ },
703
+ {
704
+ "category": "Average",
705
+ "score": 2.3333
706
+ }
707
+ ]
708
+ },
709
+ {
710
+ "model_name": "google/gemma-3-27b-it",
711
+ "results": null,
712
+ "mmlu_results": [],
713
+ "unified_exam_results": [
714
+ {
715
+ "category": "Armenian language and literature",
716
+ "score": 1.75
717
+ },
718
+ {
719
+ "category": "Armenian history",
720
+ "score": 1.0
721
+ },
722
+ {
723
+ "category": "Mathematics",
724
+ "score": 0.75
725
+ },
726
+ {
727
+ "category": "Average",
728
+ "score": 1.1667
729
+ }
730
+ ]
731
+ },
732
+ {
733
+ "model_name": "google/gemma-3n-E2B-it",
734
+ "results": null,
735
+ "mmlu_results": [],
736
+ "unified_exam_results": [
737
+ {
738
+ "category": "Armenian language and literature",
739
+ "score": 2.25
740
+ },
741
+ {
742
+ "category": "Armenian history",
743
+ "score": 1.5
744
+ },
745
+ {
746
+ "category": "Mathematics",
747
+ "score": 4.25
748
+ },
749
+ {
750
+ "category": "Average",
751
+ "score": 2.6667
752
+ }
753
+ ]
754
+ },
755
+ {
756
+ "model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
757
+ "results": null,
758
+ "mmlu_results": [],
759
+ "unified_exam_results": [
760
+ {
761
+ "category": "Armenian language and literature",
762
+ "score": 6.25
763
+ },
764
+ {
765
+ "category": "Armenian history",
766
+ "score": 5.0
767
+ },
768
+ {
769
+ "category": "Mathematics",
770
+ "score": 12.5
771
+ },
772
+ {
773
+ "category": "Average",
774
+ "score": 7.9167
775
+ }
776
+ ]
777
+ },
778
+ {
779
+ "model_name": "Qwen/Qwen3-32B",
780
+ "results": null,
781
+ "mmlu_results": [],
782
+ "unified_exam_results": [
783
+ {
784
+ "category": "Armenian language and literature",
785
+ "score": 4.75
786
+ },
787
+ {
788
+ "category": "Armenian history",
789
+ "score": 3.5
790
+ },
791
+ {
792
+ "category": "Mathematics",
793
+ "score": 14.0
794
+ },
795
+ {
796
+ "category": "Average",
797
+ "score": 7.4167
798
+ }
799
+ ]
800
+ },
801
+ {
802
+ "model_name": "Qwen/QwQ-32B",
803
+ "results": null,
804
+ "mmlu_results": [],
805
+ "unified_exam_results": [
806
+ {
807
+ "category": "Armenian language and literature",
808
+ "score": 2.5
809
+ },
810
+ {
811
+ "category": "Armenian history",
812
+ "score": 2.5
813
+ },
814
+ {
815
+ "category": "Mathematics",
816
+ "score": 10.5
817
+ },
818
+ {
819
+ "category": "Average",
820
+ "score": 5.1667
821
+ }
822
+ ]
823
+ }
824
  ]