Spaces:
Running
Running
mmlu api models update
Browse files- data_handler.py +0 -1
- model_results.json +186 -3
data_handler.py
CHANGED
@@ -134,4 +134,3 @@ def mmlu_chart(mmlu_df, plot_column):
|
|
134 |
fig.update_yaxes(tickfont=dict(size=10))
|
135 |
|
136 |
return fig
|
137 |
-
|
|
|
134 |
fig.update_yaxes(tickfont=dict(size=10))
|
135 |
|
136 |
return fig
|
|
model_results.json
CHANGED
@@ -837,7 +837,68 @@
|
|
837 |
{
|
838 |
"model_name": "gemini-2.5-pro",
|
839 |
"results": {
|
840 |
-
"mmlu_results": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
841 |
"unified_exam_results": [
|
842 |
{
|
843 |
"category": "Average",
|
@@ -861,7 +922,68 @@
|
|
861 |
{
|
862 |
"model_name": "gpt-4.1-2025-04-14",
|
863 |
"results": {
|
864 |
-
"mmlu_results": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
865 |
"unified_exam_results": [
|
866 |
{
|
867 |
"category": "Average",
|
@@ -885,7 +1007,68 @@
|
|
885 |
{
|
886 |
"model_name": "claude-sonnet-4-20250514",
|
887 |
"results": {
|
888 |
-
"mmlu_results": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
889 |
"unified_exam_results": [
|
890 |
{
|
891 |
"category": "Average",
|
|
|
837 |
{
|
838 |
"model_name": "gemini-2.5-pro",
|
839 |
"results": {
|
840 |
+
"mmlu_results": [
|
841 |
+
{
|
842 |
+
"category": "Average",
|
843 |
+
"score": 0.8241
|
844 |
+
},
|
845 |
+
{
|
846 |
+
"category": "Biology",
|
847 |
+
"score": 0.8833
|
848 |
+
},
|
849 |
+
{
|
850 |
+
"category": "Business",
|
851 |
+
"score": 0.9242
|
852 |
+
},
|
853 |
+
{
|
854 |
+
"category": "Chemistry",
|
855 |
+
"score": 0.8947
|
856 |
+
},
|
857 |
+
{
|
858 |
+
"category": "Computer Science",
|
859 |
+
"score": 0.8529
|
860 |
+
},
|
861 |
+
{
|
862 |
+
"category": "Economics",
|
863 |
+
"score": 0.8873
|
864 |
+
},
|
865 |
+
{
|
866 |
+
"category": "Engineering",
|
867 |
+
"score": 0.725
|
868 |
+
},
|
869 |
+
{
|
870 |
+
"category": "Health",
|
871 |
+
"score": 0.8088
|
872 |
+
},
|
873 |
+
{
|
874 |
+
"category": "History",
|
875 |
+
"score": 0.6552
|
876 |
+
},
|
877 |
+
{
|
878 |
+
"category": "Law",
|
879 |
+
"score": 0.6517
|
880 |
+
},
|
881 |
+
{
|
882 |
+
"category": "Math",
|
883 |
+
"score": 0.823
|
884 |
+
},
|
885 |
+
{
|
886 |
+
"category": "Other",
|
887 |
+
"score": 0.8312
|
888 |
+
},
|
889 |
+
{
|
890 |
+
"category": "Philosophy",
|
891 |
+
"score": 0.8333
|
892 |
+
},
|
893 |
+
{
|
894 |
+
"category": "Physics",
|
895 |
+
"score": 0.8716
|
896 |
+
},
|
897 |
+
{
|
898 |
+
"category": "Psychology",
|
899 |
+
"score": 0.8955
|
900 |
+
}
|
901 |
+
],
|
902 |
"unified_exam_results": [
|
903 |
{
|
904 |
"category": "Average",
|
|
|
922 |
{
|
923 |
"model_name": "gpt-4.1-2025-04-14",
|
924 |
"results": {
|
925 |
+
"mmlu_results": [
|
926 |
+
{
|
927 |
+
"category": "Average",
|
928 |
+
"score": 0.756
|
929 |
+
},
|
930 |
+
{
|
931 |
+
"category": "Biology",
|
932 |
+
"score": 0.8667
|
933 |
+
},
|
934 |
+
{
|
935 |
+
"category": "Business",
|
936 |
+
"score": 0.8939
|
937 |
+
},
|
938 |
+
{
|
939 |
+
"category": "Chemistry",
|
940 |
+
"score": 0.8632
|
941 |
+
},
|
942 |
+
{
|
943 |
+
"category": "Computer Science",
|
944 |
+
"score": 0.7353
|
945 |
+
},
|
946 |
+
{
|
947 |
+
"category": "Economics",
|
948 |
+
"score": 0.8732
|
949 |
+
},
|
950 |
+
{
|
951 |
+
"category": "Engineering",
|
952 |
+
"score": 0.625
|
953 |
+
},
|
954 |
+
{
|
955 |
+
"category": "Health",
|
956 |
+
"score": 0.7353
|
957 |
+
},
|
958 |
+
{
|
959 |
+
"category": "History",
|
960 |
+
"score": 0.6897
|
961 |
+
},
|
962 |
+
{
|
963 |
+
"category": "Law",
|
964 |
+
"score": 0.573
|
965 |
+
},
|
966 |
+
{
|
967 |
+
"category": "Math",
|
968 |
+
"score": 0.8496
|
969 |
+
},
|
970 |
+
{
|
971 |
+
"category": "Other",
|
972 |
+
"score": 0.6494
|
973 |
+
},
|
974 |
+
{
|
975 |
+
"category": "Philosophy",
|
976 |
+
"score": 0.6429
|
977 |
+
},
|
978 |
+
{
|
979 |
+
"category": "Physics",
|
980 |
+
"score": 0.8257
|
981 |
+
},
|
982 |
+
{
|
983 |
+
"category": "Psychology",
|
984 |
+
"score": 0.7612
|
985 |
+
}
|
986 |
+
],
|
987 |
"unified_exam_results": [
|
988 |
{
|
989 |
"category": "Average",
|
|
|
1007 |
{
|
1008 |
"model_name": "claude-sonnet-4-20250514",
|
1009 |
"results": {
|
1010 |
+
"mmlu_results": [
|
1011 |
+
{
|
1012 |
+
"category": "Average",
|
1013 |
+
"score": 0.7459
|
1014 |
+
},
|
1015 |
+
{
|
1016 |
+
"category": "Biology",
|
1017 |
+
"score": 0.8167
|
1018 |
+
},
|
1019 |
+
{
|
1020 |
+
"category": "Business",
|
1021 |
+
"score": 0.8788
|
1022 |
+
},
|
1023 |
+
{
|
1024 |
+
"category": "Chemistry",
|
1025 |
+
"score": 0.7789
|
1026 |
+
},
|
1027 |
+
{
|
1028 |
+
"category": "Computer Science",
|
1029 |
+
"score": 0.8824
|
1030 |
+
},
|
1031 |
+
{
|
1032 |
+
"category": "Economics",
|
1033 |
+
"score": 0.8873
|
1034 |
+
},
|
1035 |
+
{
|
1036 |
+
"category": "Engineering",
|
1037 |
+
"score": 0.6625
|
1038 |
+
},
|
1039 |
+
{
|
1040 |
+
"category": "Health",
|
1041 |
+
"score": 0.7206
|
1042 |
+
},
|
1043 |
+
{
|
1044 |
+
"category": "History",
|
1045 |
+
"score": 0.5517
|
1046 |
+
},
|
1047 |
+
{
|
1048 |
+
"category": "Law",
|
1049 |
+
"score": 0.3933
|
1050 |
+
},
|
1051 |
+
{
|
1052 |
+
"category": "Math",
|
1053 |
+
"score": 0.9027
|
1054 |
+
},
|
1055 |
+
{
|
1056 |
+
"category": "Other",
|
1057 |
+
"score": 0.6883
|
1058 |
+
},
|
1059 |
+
{
|
1060 |
+
"category": "Philosophy",
|
1061 |
+
"score": 0.6667
|
1062 |
+
},
|
1063 |
+
{
|
1064 |
+
"category": "Physics",
|
1065 |
+
"score": 0.8073
|
1066 |
+
},
|
1067 |
+
{
|
1068 |
+
"category": "Psychology",
|
1069 |
+
"score": 0.806
|
1070 |
+
}
|
1071 |
+
],
|
1072 |
"unified_exam_results": [
|
1073 |
{
|
1074 |
"category": "Average",
|