Bagratuni committed
Commit 95ec4a1 · 1 Parent(s): 33a913f

mmlu api models update

Files changed (2):
  1. data_handler.py +0 -1
  2. model_results.json +186 -3
data_handler.py CHANGED
@@ -134,4 +134,3 @@ def mmlu_chart(mmlu_df, plot_column):
     fig.update_yaxes(tickfont=dict(size=10))
 
     return fig
-
model_results.json CHANGED
@@ -837,7 +837,68 @@
   {
     "model_name": "gemini-2.5-pro",
     "results": {
-      "mmlu_results": [],
+      "mmlu_results": [
+        {
+          "category": "Average",
+          "score": 0.8241
+        },
+        {
+          "category": "Biology",
+          "score": 0.8833
+        },
+        {
+          "category": "Business",
+          "score": 0.9242
+        },
+        {
+          "category": "Chemistry",
+          "score": 0.8947
+        },
+        {
+          "category": "Computer Science",
+          "score": 0.8529
+        },
+        {
+          "category": "Economics",
+          "score": 0.8873
+        },
+        {
+          "category": "Engineering",
+          "score": 0.725
+        },
+        {
+          "category": "Health",
+          "score": 0.8088
+        },
+        {
+          "category": "History",
+          "score": 0.6552
+        },
+        {
+          "category": "Law",
+          "score": 0.6517
+        },
+        {
+          "category": "Math",
+          "score": 0.823
+        },
+        {
+          "category": "Other",
+          "score": 0.8312
+        },
+        {
+          "category": "Philosophy",
+          "score": 0.8333
+        },
+        {
+          "category": "Physics",
+          "score": 0.8716
+        },
+        {
+          "category": "Psychology",
+          "score": 0.8955
+        }
+      ],
       "unified_exam_results": [
         {
           "category": "Average",
@@ -861,7 +922,68 @@
   {
     "model_name": "gpt-4.1-2025-04-14",
     "results": {
-      "mmlu_results": [],
+      "mmlu_results": [
+        {
+          "category": "Average",
+          "score": 0.756
+        },
+        {
+          "category": "Biology",
+          "score": 0.8667
+        },
+        {
+          "category": "Business",
+          "score": 0.8939
+        },
+        {
+          "category": "Chemistry",
+          "score": 0.8632
+        },
+        {
+          "category": "Computer Science",
+          "score": 0.7353
+        },
+        {
+          "category": "Economics",
+          "score": 0.8732
+        },
+        {
+          "category": "Engineering",
+          "score": 0.625
+        },
+        {
+          "category": "Health",
+          "score": 0.7353
+        },
+        {
+          "category": "History",
+          "score": 0.6897
+        },
+        {
+          "category": "Law",
+          "score": 0.573
+        },
+        {
+          "category": "Math",
+          "score": 0.8496
+        },
+        {
+          "category": "Other",
+          "score": 0.6494
+        },
+        {
+          "category": "Philosophy",
+          "score": 0.6429
+        },
+        {
+          "category": "Physics",
+          "score": 0.8257
+        },
+        {
+          "category": "Psychology",
+          "score": 0.7612
+        }
+      ],
       "unified_exam_results": [
         {
           "category": "Average",
@@ -885,7 +1007,68 @@
   {
     "model_name": "claude-sonnet-4-20250514",
     "results": {
-      "mmlu_results": [],
+      "mmlu_results": [
+        {
+          "category": "Average",
+          "score": 0.7459
+        },
+        {
+          "category": "Biology",
+          "score": 0.8167
+        },
+        {
+          "category": "Business",
+          "score": 0.8788
+        },
+        {
+          "category": "Chemistry",
+          "score": 0.7789
+        },
+        {
+          "category": "Computer Science",
+          "score": 0.8824
+        },
+        {
+          "category": "Economics",
+          "score": 0.8873
+        },
+        {
+          "category": "Engineering",
+          "score": 0.6625
+        },
+        {
+          "category": "Health",
+          "score": 0.7206
+        },
+        {
+          "category": "History",
+          "score": 0.5517
+        },
+        {
+          "category": "Law",
+          "score": 0.3933
+        },
+        {
+          "category": "Math",
+          "score": 0.9027
+        },
+        {
+          "category": "Other",
+          "score": 0.6883
+        },
+        {
+          "category": "Philosophy",
+          "score": 0.6667
+        },
+        {
+          "category": "Physics",
+          "score": 0.8073
+        },
+        {
+          "category": "Psychology",
+          "score": 0.806
+        }
+      ],
       "unified_exam_results": [
         {
           "category": "Average",