diff --git a/results/cross_lingual/few_shot/cross_logiqa.csv b/results/cross_lingual/few_shot/cross_logiqa.csv index be33b49f018e61bffb67aba31e9dbe3c22c4e3f2..55b25bffcb3b1df623a69b55f4153537b9e6eac2 100644 --- a/results/cross_lingual/few_shot/cross_logiqa.csv +++ b/results/cross_lingual/few_shot/cross_logiqa.csv @@ -1,2 +1,4 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3-70B,0.6152597402597404,0.49480519480519464,0.5484971301967684,0.7272727272727273,0.6534090909090909,0.625,0.5681818181818182,0.6136363636363636,0.5795454545454546,0.5397727272727273 Meta-Llama-3-8B,0.44967532467532456,0.2623376623376623,0.33136129711503204,0.5227272727272727,0.4431818181818182,0.44886363636363635,0.44886363636363635,0.3693181818181818,0.4602272727272727,0.45454545454545453 +Meta-Llama-3.1-8B,0.46266233766233766,0.277435064935065,0.34686989908229837,0.5284090909090909,0.5,0.4375,0.4772727272727273,0.4318181818181818,0.4431818181818182,0.42045454545454547 diff --git a/results/cross_lingual/few_shot/cross_mmlu.csv b/results/cross_lingual/few_shot/cross_mmlu.csv index 523b1a9f90c8f2daba0c682e477f044e0522d413..401c2e9ccac8f99d519a1d282121e2d80acef223 100644 --- a/results/cross_lingual/few_shot/cross_mmlu.csv +++ b/results/cross_lingual/few_shot/cross_mmlu.csv @@ -1,2 +1,4 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3-70B,0.7552380952380952,0.6674285714285715,0.708623453080271,0.8066666666666666,0.7266666666666667,0.7866666666666666,0.7533333333333333,0.7733333333333333,0.72,0.72 Meta-Llama-3-8B,0.5295238095238096,0.31923809523809527,0.3983311959862401,0.6266666666666667,0.5466666666666666,0.56,0.4866666666666667,0.5266666666666666,0.5,0.46 +Meta-Llama-3.1-8B,0.5342857142857141,0.2960000000000001,0.3809497590731823,0.6733333333333333,0.5533333333333333,0.5133333333333333,0.47333333333333333,0.5133333333333333,0.5,0.5133333333333333 diff --git a/results/cross_lingual/few_shot/cross_xquad.csv b/results/cross_lingual/few_shot/cross_xquad.csv index 47f23ab1b3c8b57b62758f43972cbaf9c94bdad4..ea076a2307d40cb285aafff84d82652d1d4fd17e 100644 --- a/results/cross_lingual/few_shot/cross_xquad.csv +++ b/results/cross_lingual/few_shot/cross_xquad.csv @@ -1,3 +1,4 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino Meta-Llama-3-70B,0.9596638655462185,0.9359243697478992,0.9476454662047799,0.9697478991596639,0.9504201680672268,0.957983193277311,0.9605042016806723,,, Meta-Llama-3-8B,0.8928571428571429,0.8163865546218487,0.8529112234365448,0.926890756302521,0.8823529411764706,0.888235294117647,0.8739495798319328,,, +Meta-Llama-3.1-8B,0.9052521008403361,0.8355042016806722,0.8689808363106925,0.9352941176470588,0.8932773109243698,0.9,0.892436974789916,,, diff --git a/results/cross_lingual/zero_shot/cross_logiqa.csv b/results/cross_lingual/zero_shot/cross_logiqa.csv index 97abcd73442dc9bfd8da75460aefacdfd6354be5..6a8836e9b87dc251991571828357ee54d6677019 100644 --- a/results/cross_lingual/zero_shot/cross_logiqa.csv +++ b/results/cross_lingual/zero_shot/cross_logiqa.csv @@ -1,5 +1,6 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino Qwen2-7B-Instruct,0.5673701298701299,0.477922077922078,0.5188189663543613,0.6590909090909091,0.6704545454545454,0.5340909090909091,0.5625,0.5340909090909091,0.5397727272727273,0.4715909090909091 +Meta-Llama-3.1-8B-Instruct,0.43993506493506496,0.33425324675324675,0.37988102268160845,0.5113636363636364,0.45454545454545453,0.4772727272727273,0.48295454545454547,0.3977272727272727,0.39204545454545453,0.36363636363636365 Qwen2-72B-Instruct,0.6753246753246753,0.6814935064935067,0.6783950674333673,0.75,0.8125,0.6647727272727273,0.6136363636363636,0.6420454545454546,0.6590909090909091,0.5852272727272727 Meta-Llama-3-8B-Instruct,0.4115259740259741,0.34042207792207796,0.3726122484532397,0.48863636363636365,0.4659090909090909,0.42613636363636365,0.4034090909090909,0.4034090909090909,0.36363636363636365,0.32954545454545453 Meta-Llama-3-70B-Instruct,0.6290584415584416,0.6181818181818182,0.6235727047409828,0.6988636363636364,0.6875,0.6420454545454546,0.6193181818181818,0.6022727272727273,0.6136363636363636,0.5397727272727273 diff --git a/results/cross_lingual/zero_shot/cross_mmlu.csv b/results/cross_lingual/zero_shot/cross_mmlu.csv index 2efe436caa86f564a5ac88181441d7495afaa3ec..b9864dc4d883b7ebfc072ccc5ac0095f89b75fbd 100644 --- a/results/cross_lingual/zero_shot/cross_mmlu.csv +++ b/results/cross_lingual/zero_shot/cross_mmlu.csv @@ -1,5 +1,6 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino Qwen2-7B-Instruct,0.6495238095238095,0.529714285714286,0.5835327779462245,0.74,0.6733333333333333,0.7,0.6,0.6533333333333333,0.6333333333333333,0.5466666666666666 +Meta-Llama-3.1-8B-Instruct,0.5771428571428572,0.47047619047619055,0.5183792207297393,0.6933333333333334,0.5333333333333333,0.6266666666666667,0.54,0.54,0.54,0.5666666666666667 Qwen2-72B-Instruct,0.7714285714285715,0.7765714285714286,0.773991456997936,0.8,0.78,0.7866666666666666,0.7333333333333333,0.76,0.78,0.76 Meta-Llama-3-8B-Instruct,0.5276190476190475,0.3792380952380953,0.4412894449458876,0.62,0.5066666666666667,0.5066666666666667,0.5466666666666666,0.49333333333333335,0.52,0.5 Meta-Llama-3-70B-Instruct,0.7542857142857143,0.7228571428571428,0.7382370820168919,0.7933333333333333,0.74,0.7666666666666667,0.7466666666666667,0.7666666666666667,0.72,0.7466666666666667 diff --git a/results/cross_lingual/zero_shot/cross_xquad.csv b/results/cross_lingual/zero_shot/cross_xquad.csv index 9469a0d301f598d405b7af680360e3b0f4ef85f2..fb5e9fba95d72a69f67c4e793e3d6b3f09330048 100644 --- a/results/cross_lingual/zero_shot/cross_xquad.csv +++ b/results/cross_lingual/zero_shot/cross_xquad.csv @@ -1,5 +1,6 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino Qwen2-7B-Instruct,0.940546218487395,0.9016806722689076,0.9207034712119446,0.9521008403361344,0.9352941176470588,0.9445378151260504,0.9302521008403362,,, +Meta-Llama-3.1-8B-Instruct,0.9340336134453782,0.8831932773109243,0.9079022683718587,0.9369747899159664,0.9302521008403362,0.946218487394958,0.9226890756302522,,, Qwen2-72B-Instruct,0.9611344537815126,0.9506302521008403,0.9558534951942531,0.9638655462184874,0.9554621848739496,0.9613445378151261,0.9638655462184874,,, Meta-Llama-3-8B-Instruct,0.8756302521008403,0.7699579831932772,0.8194012188828194,0.8815126050420168,0.8420168067226891,0.9092436974789916,0.8697478991596639,,, Meta-Llama-3-70B-Instruct,0.9586134453781513,0.9434873949579832,0.9509902767764395,0.9705882352941176,0.9394957983193277,0.9596638655462185,0.9647058823529412,,, diff --git a/results/cultural_reasoning/few_shot/cn_eval.csv b/results/cultural_reasoning/few_shot/cn_eval.csv index 7f2af95b8bcc95e96f349a49f75ac662d828e585..d2e2a5e57ebbe9495d252bd892fde3f15c3893c8 100644 --- a/results/cultural_reasoning/few_shot/cn_eval.csv +++ b/results/cultural_reasoning/few_shot/cn_eval.csv @@ -1,2 +1,4 @@ Model,Accuracy +Meta-Llama-3-70B,0.6 Meta-Llama-3-8B,0.41904761904761906 +Meta-Llama-3.1-8B,0.4857142857142857 diff --git a/results/cultural_reasoning/few_shot/ph_eval.csv b/results/cultural_reasoning/few_shot/ph_eval.csv index d9935ad2b8bc6a05dbb2462797fed42c4b22fe35..42982652bbed1993eeb121fd25ed78245e8f26db 100644 --- a/results/cultural_reasoning/few_shot/ph_eval.csv +++ b/results/cultural_reasoning/few_shot/ph_eval.csv @@ -1,2 +1,4 @@ Model,Accuracy +Meta-Llama-3-70B,0.68 Meta-Llama-3-8B,0.54 +Meta-Llama-3.1-8B,0.51 diff --git a/results/cultural_reasoning/few_shot/sg_eval.csv b/results/cultural_reasoning/few_shot/sg_eval.csv index 05e2fd4bcd810817496f13cf6e42900a5154661d..a27e30f4ddf50d3723f34ca1cea815a7d360dcf6 100644 --- a/results/cultural_reasoning/few_shot/sg_eval.csv +++ b/results/cultural_reasoning/few_shot/sg_eval.csv @@ -1,3 +1,4 @@ Model,Accuracy Meta-Llama-3-70B,0.7572815533980582 Meta-Llama-3-8B,0.6407766990291263 +Meta-Llama-3.1-8B,0.6116504854368932 diff --git a/results/cultural_reasoning/few_shot/us_eval.csv b/results/cultural_reasoning/few_shot/us_eval.csv index a464de78937fcb4cab139b625cd1e1ab52450078..436de8a011cf0af44d167135be3a62ce832083dc 100644 --- a/results/cultural_reasoning/few_shot/us_eval.csv +++ b/results/cultural_reasoning/few_shot/us_eval.csv @@ -1,2 +1,4 @@ Model,Accuracy +Meta-Llama-3-70B,0.8785046728971962 Meta-Llama-3-8B,0.6915887850467289 +Meta-Llama-3.1-8B,0.6728971962616822 diff --git a/results/cultural_reasoning/zero_shot/cn_eval.csv b/results/cultural_reasoning/zero_shot/cn_eval.csv index 23d5ebc3574ee9033938c0436bdf4ccd6db917b1..502850959810bacdeeb188b9c1a0b52f36c7f6e9 100644 --- a/results/cultural_reasoning/zero_shot/cn_eval.csv +++ b/results/cultural_reasoning/zero_shot/cn_eval.csv @@ -1,5 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.8095238095238095 +Meta-Llama-3.1-8B-Instruct,0.42857142857142855 Qwen2-72B-Instruct,0.8571428571428571 Meta-Llama-3-8B-Instruct,0.37142857142857144 Meta-Llama-3-70B-Instruct,0.5142857142857142 diff --git a/results/cultural_reasoning/zero_shot/ph_eval.csv b/results/cultural_reasoning/zero_shot/ph_eval.csv index 4b64dcef17e18ce8428819ba569460a5df1ebe58..35928cca2b5c82e8d2886191dfb08710954c2334 100644 --- a/results/cultural_reasoning/zero_shot/ph_eval.csv +++ b/results/cultural_reasoning/zero_shot/ph_eval.csv @@ -1,5 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.51 +Meta-Llama-3.1-8B-Instruct,0.56 Qwen2-72B-Instruct,0.63 Meta-Llama-3-8B-Instruct,0.54 Meta-Llama-3-70B-Instruct,0.63 diff --git a/results/cultural_reasoning/zero_shot/sg_eval.csv b/results/cultural_reasoning/zero_shot/sg_eval.csv index 07e4c7a974256a9b60b0dd038ba0e49117ea212a..240e3394a6a0383a55ae099edfccfab5476458f0 100644 --- a/results/cultural_reasoning/zero_shot/sg_eval.csv +++ b/results/cultural_reasoning/zero_shot/sg_eval.csv @@ -1,5 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.6699029126213593 +Meta-Llama-3.1-8B-Instruct,0.6019417475728155 Qwen2-72B-Instruct,0.7378640776699029 Meta-Llama-3-8B-Instruct,0.5922330097087378 Meta-Llama-3-70B-Instruct,0.7184466019417476 diff --git a/results/cultural_reasoning/zero_shot/us_eval.csv b/results/cultural_reasoning/zero_shot/us_eval.csv index 86051361b9c2259b907dd153290f13f52503b1ef..42d9f7490b7036ade93837a9e430714bd9b8f76e 100644 --- a/results/cultural_reasoning/zero_shot/us_eval.csv +++ b/results/cultural_reasoning/zero_shot/us_eval.csv @@ -1,5 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.719626168224299 +Meta-Llama-3.1-8B-Instruct,0.6448598130841121 Qwen2-72B-Instruct,0.8504672897196262 Meta-Llama-3-8B-Instruct,0.6448598130841121 Meta-Llama-3-70B-Instruct,0.8691588785046729 diff --git a/results/dialogue/few_shot/dream.csv b/results/dialogue/few_shot/dream.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..4c4bcf39bba15f4246d290adb75332b64cc48a3b 100644 --- a/results/dialogue/few_shot/dream.csv +++ b/results/dialogue/few_shot/dream.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.8250857422831945 +Meta-Llama-3.1-8B,0.8530132288094071 diff --git a/results/dialogue/zero_shot/dialogsum.csv b/results/dialogue/zero_shot/dialogsum.csv index 060f6a61affdc541783817639f2ad160cb26cf1b..1ee43b59821c938bb1ec4273faa1ddecb72584ec 100644 --- a/results/dialogue/zero_shot/dialogsum.csv +++ b/results/dialogue/zero_shot/dialogsum.csv @@ -1,4 +1,6 @@ Model,Average,ROUGE-1,ROUGE-2,ROUGE-L Qwen2-7B-Instruct,0.20907406151501814,0.3054588156947843,0.09317750879187732,0.22858586005839285 +Meta-Llama-3.1-8B-Instruct,0.25775524210830225,0.361264483769506,0.1319601664036931,0.28004107615170776 +Qwen2-72B-Instruct,0.21903635116217549,0.31670807543803475,0.10250931612356096,0.23789166192493072 Meta-Llama-3-8B-Instruct,0.23748034560689027,0.33656243928704743,0.11826169056076426,0.2576169069728591 Meta-Llama-3-70B-Instruct,0.2557065499979308,0.36058417323628,0.12758087337786866,0.2789546033796438 diff --git a/results/dialogue/zero_shot/dream.csv b/results/dialogue/zero_shot/dream.csv index c12fed5b208f662dcd60e50cc34d6ebaacca2157..375a5307d56b17adc14a80dfc365df5afec66090 100644 --- a/results/dialogue/zero_shot/dream.csv +++ b/results/dialogue/zero_shot/dream.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.9338559529642332 +Meta-Llama-3.1-8B-Instruct,0.8858402743753062 +Qwen2-72B-Instruct,0.9608035276825085 Meta-Llama-3-8B-Instruct,0.5433610975012249 Meta-Llama-3-70B-Instruct,0.9480646741793238 diff --git a/results/dialogue/zero_shot/samsum.csv b/results/dialogue/zero_shot/samsum.csv index c6f8f401d1e68d13ca64d56281202573a0378bc4..14c69468062e0b15b1d4f2c20f04bd657498f174 100644 --- a/results/dialogue/zero_shot/samsum.csv +++ b/results/dialogue/zero_shot/samsum.csv @@ -1,4 +1,6 @@ Model,Average,ROUGE-1,ROUGE-2,ROUGE-L Qwen2-7B-Instruct,0.2609036529701212,0.36802926348230236,0.1319027531874975,0.28277894224056366 +Meta-Llama-3.1-8B-Instruct,0.3002534894623792,0.41234119292969856,0.16596515741670248,0.3224541180407366 +Qwen2-72B-Instruct,0.27953180135225114,0.3883786925058577,0.15246657328712612,0.2977501382637696 Meta-Llama-3-8B-Instruct,0.2850232460296334,0.3945214081577773,0.15619034353394273,0.3043579863971803 Meta-Llama-3-70B-Instruct,0.2893525314227379,0.4030746211134018,0.15236139065578,0.3126215824990321 diff --git a/results/emotion/few_shot/ind_emotion.csv b/results/emotion/few_shot/ind_emotion.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..44411de9d0ff02e4439efcc07851766848d5700c 100644 --- a/results/emotion/few_shot/ind_emotion.csv +++ b/results/emotion/few_shot/ind_emotion.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.4636363636363636 +Meta-Llama-3.1-8B,0.5136363636363637 diff --git a/results/emotion/few_shot/sst2.csv b/results/emotion/few_shot/sst2.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..213ef049d359f52bb53d0ea5f2e9f577bd998b14 100644 --- a/results/emotion/few_shot/sst2.csv +++ b/results/emotion/few_shot/sst2.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.6697247706422018 +Meta-Llama-3.1-8B,0.8405963302752294 diff --git a/results/emotion/zero_shot/ind_emotion.csv b/results/emotion/zero_shot/ind_emotion.csv index 704023cc83184a72a07a429b3cc789bf30c96764..5ea412f8ca88a2523e1b6c6ffdc10d2bce4829e0 100644 --- a/results/emotion/zero_shot/ind_emotion.csv +++ b/results/emotion/zero_shot/ind_emotion.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.6386363636363637 +Meta-Llama-3.1-8B-Instruct,0.6295454545454545 +Qwen2-72B-Instruct,0.675 Meta-Llama-3-8B-Instruct,0.6522727272727272 Meta-Llama-3-70B-Instruct,0.6909090909090909 diff --git a/results/emotion/zero_shot/sst2.csv b/results/emotion/zero_shot/sst2.csv index 5c9a67d7e01c440fb9f21fe79282683a2b4c7ab0..c86c06a777d4051adfc67f7fd01a1a720758c168 100644 --- a/results/emotion/zero_shot/sst2.csv +++ b/results/emotion/zero_shot/sst2.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.9231651376146789 +Meta-Llama-3.1-8B-Instruct,0.8784403669724771 +Qwen2-72B-Instruct,0.9392201834862385 Meta-Llama-3-8B-Instruct,0.8669724770642202 Meta-Llama-3-70B-Instruct,0.9495412844036697 diff --git a/results/flores_translation/few_shot/ind2eng.csv b/results/flores_translation/few_shot/ind2eng.csv index 7f534e8cdc32663e57e36872fe0572ac0a3a2411..8d3c83e3c7fd94860eb317c425f7ced94edb956a 100644 --- a/results/flores_translation/few_shot/ind2eng.csv +++ b/results/flores_translation/few_shot/ind2eng.csv @@ -1,2 +1,4 @@ Model,BLEU +Meta-Llama-3-70B,0.4224655367668861 Meta-Llama-3-8B,0.37760317005449096 +Meta-Llama-3.1-8B,0.384092499597103 diff --git a/results/flores_translation/few_shot/vie2eng.csv b/results/flores_translation/few_shot/vie2eng.csv index 56764548e5126ccb0bc1ae02ee1ec8aecd0546ac..8610cab03a2a324d12cc7e1f9c6cfd935ae211e9 100644 --- a/results/flores_translation/few_shot/vie2eng.csv +++ b/results/flores_translation/few_shot/vie2eng.csv @@ -1,2 +1,4 @@ Model,BLEU +Meta-Llama-3-70B,0.3564689224836266 Meta-Llama-3-8B,0.31157996445764863 +Meta-Llama-3.1-8B,0.320367356810332 diff --git a/results/flores_translation/few_shot/zho2eng.csv b/results/flores_translation/few_shot/zho2eng.csv index ca98b0289c01e5d0e609d239b20ed788216c429f..3bb9a71681662027668533a92d1b63f9f3a7c2bc 100644 --- a/results/flores_translation/few_shot/zho2eng.csv +++ b/results/flores_translation/few_shot/zho2eng.csv @@ -1,2 +1,4 @@ Model,BLEU +Meta-Llama-3-70B,0.27798501796196434 Meta-Llama-3-8B,0.23710858530408072 +Meta-Llama-3.1-8B,0.23777256698409086 diff --git a/results/flores_translation/few_shot/zsm2eng.csv b/results/flores_translation/few_shot/zsm2eng.csv index b560efa78538383d03231ff2577b702a9eb9c3f8..0bd4e20cfc4a099589a86788af6caf95856736b1 100644 --- a/results/flores_translation/few_shot/zsm2eng.csv +++ b/results/flores_translation/few_shot/zsm2eng.csv @@ -1,2 +1,4 @@ Model,BLEU +Meta-Llama-3-70B,0.44357168236218214 Meta-Llama-3-8B,0.3908770132718593 +Meta-Llama-3.1-8B,0.3893813156403672 diff --git a/results/flores_translation/zero_shot/ind2eng.csv b/results/flores_translation/zero_shot/ind2eng.csv index de375b52ab4c145693acec0d7ed14d1b219facd0..2e51da6abb8a415f4e49459f235935824e298e4e 100644 --- a/results/flores_translation/zero_shot/ind2eng.csv +++ b/results/flores_translation/zero_shot/ind2eng.csv @@ -1,4 +1,6 @@ Model,BLEU Qwen2-7B-Instruct,0.2968667083646938 +Meta-Llama-3.1-8B-Instruct,0.3851478947359834 +Qwen2-72B-Instruct,0.40378146176265345 Meta-Llama-3-8B-Instruct,0.33011728860318257 Meta-Llama-3-70B-Instruct,0.3830092775167675 diff --git a/results/flores_translation/zero_shot/vie2eng.csv b/results/flores_translation/zero_shot/vie2eng.csv index ffc10e5cbab87e1ffe9a28ead5340d2249b5a667..a6b7e5b5b6a3269e82f80a56cc50e6a3472ac4cc 100644 --- a/results/flores_translation/zero_shot/vie2eng.csv +++ b/results/flores_translation/zero_shot/vie2eng.csv @@ -1,4 +1,6 @@ Model,BLEU Qwen2-7B-Instruct,0.23571859325121644 +Meta-Llama-3.1-8B-Instruct,0.3229889780558947 +Qwen2-72B-Instruct,0.3326034551014482 Meta-Llama-3-8B-Instruct,0.2637063711923046 Meta-Llama-3-70B-Instruct,0.3230140263371192 diff --git a/results/flores_translation/zero_shot/zho2eng.csv b/results/flores_translation/zero_shot/zho2eng.csv index 8e2fe04f143cbbaad6c5212a6dc3097fc3f39739..76dd6fca014b8e3632690fb1a81c9ffa1cd6a88c 100644 --- a/results/flores_translation/zero_shot/zho2eng.csv +++ b/results/flores_translation/zero_shot/zho2eng.csv @@ -1,4 +1,6 @@ Model,BLEU Qwen2-7B-Instruct,0.21747115262398484 +Meta-Llama-3.1-8B-Instruct,0.24469097639356438 +Qwen2-72B-Instruct,0.24317967002278634 Meta-Llama-3-8B-Instruct,0.19960072119079214 Meta-Llama-3-70B-Instruct,0.24397819518058994 diff --git a/results/flores_translation/zero_shot/zsm2eng.csv b/results/flores_translation/zero_shot/zsm2eng.csv index d8e006fdb7ae46f7cfd3aa8a1f8cf8264589703b..ebbd5d2a5df5ca945b02cd378280cb87eede3069 100644 --- a/results/flores_translation/zero_shot/zsm2eng.csv +++ b/results/flores_translation/zero_shot/zsm2eng.csv @@ -1,4 +1,6 @@ Model,BLEU Qwen2-7B-Instruct,0.27198336767927184 +Meta-Llama-3.1-8B-Instruct,0.3833985449157327 +Qwen2-72B-Instruct,0.40613262295280417 Meta-Llama-3-8B-Instruct,0.31536374302282033 Meta-Llama-3-70B-Instruct,0.3957287030176054 diff --git a/results/fundamental_nlp_tasks/few_shot/c3.csv b/results/fundamental_nlp_tasks/few_shot/c3.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..ac6ab69c6924cd506921d904ba00ecf331ce0d7c 100644 --- a/results/fundamental_nlp_tasks/few_shot/c3.csv +++ b/results/fundamental_nlp_tasks/few_shot/c3.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.7703814510097232 +Meta-Llama-3.1-8B,0.8208676140613314 diff --git a/results/fundamental_nlp_tasks/few_shot/cola.csv b/results/fundamental_nlp_tasks/few_shot/cola.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..2919083f58eff6b24b2f521ee33e9cd7a54fb961 100644 --- a/results/fundamental_nlp_tasks/few_shot/cola.csv +++ b/results/fundamental_nlp_tasks/few_shot/cola.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.6596356663470757 +Meta-Llama-3.1-8B,0.6222435282837967 diff --git a/results/fundamental_nlp_tasks/few_shot/mnli.csv b/results/fundamental_nlp_tasks/few_shot/mnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..0e6db01fa62fd15c9036936c9924db4d51441f6a 100644 --- a/results/fundamental_nlp_tasks/few_shot/mnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/mnli.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.46174988547869905 diff --git a/results/fundamental_nlp_tasks/few_shot/mrpc.csv b/results/fundamental_nlp_tasks/few_shot/mrpc.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..827c2a4f17120ac66a4a401ab0503bcfc0509ee9 100644 --- a/results/fundamental_nlp_tasks/few_shot/mrpc.csv +++ b/results/fundamental_nlp_tasks/few_shot/mrpc.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.5906862745098039 diff --git a/results/fundamental_nlp_tasks/few_shot/ocnli.csv b/results/fundamental_nlp_tasks/few_shot/ocnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..431786e6d82afb54487945f98835be478c2ed4b9 100644 --- a/results/fundamental_nlp_tasks/few_shot/ocnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/ocnli.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.3935593220338983 +Meta-Llama-3.1-8B,0.411864406779661 diff --git a/results/fundamental_nlp_tasks/few_shot/qnli.csv b/results/fundamental_nlp_tasks/few_shot/qnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..c0d88dd0d1d5560c1a255c7b660b67b32bb19c94 100644 --- a/results/fundamental_nlp_tasks/few_shot/qnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/qnli.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.5059491122094087 diff --git a/results/fundamental_nlp_tasks/few_shot/qqp.csv b/results/fundamental_nlp_tasks/few_shot/qqp.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..3fa79a9fa8e34aa75e0b73c9b0f1fee46b624926 100644 --- a/results/fundamental_nlp_tasks/few_shot/qqp.csv +++ b/results/fundamental_nlp_tasks/few_shot/qqp.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.551 diff --git a/results/fundamental_nlp_tasks/few_shot/rte.csv b/results/fundamental_nlp_tasks/few_shot/rte.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..a0515adde4caa19452cd45bcd284ddc40f359306 100644 --- a/results/fundamental_nlp_tasks/few_shot/rte.csv +++ b/results/fundamental_nlp_tasks/few_shot/rte.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.5487364620938628 diff --git a/results/fundamental_nlp_tasks/few_shot/wnli.csv b/results/fundamental_nlp_tasks/few_shot/wnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..c45edabed316b93f43b6da88ee0a56d880ae43c2 100644 --- a/results/fundamental_nlp_tasks/few_shot/wnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/wnli.csv @@ -1 +1,2 @@ Model,Accuracy +Meta-Llama-3-8B,0.4647887323943662 diff --git a/results/fundamental_nlp_tasks/zero_shot/c3.csv b/results/fundamental_nlp_tasks/zero_shot/c3.csv index be600fb4de242c35c44421e82f442425f3d40d9a..7bc3f7b76ed3514ac2d5fe43681cb33f9c190ec0 100644 --- a/results/fundamental_nlp_tasks/zero_shot/c3.csv +++ b/results/fundamental_nlp_tasks/zero_shot/c3.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.9233358264771877 +Meta-Llama-3.1-8B-Instruct,0.7984293193717278 +Qwen2-72B-Instruct,0.9599850411368736 Meta-Llama-3-8B-Instruct,0.8515332834704562 Meta-Llama-3-70B-Instruct,0.9521316379955124 diff --git a/results/fundamental_nlp_tasks/zero_shot/cola.csv b/results/fundamental_nlp_tasks/zero_shot/cola.csv index 093f0edfd88f87334431c3f1a063ada79b2c3f9f..5d8056404626d0edf3f18ffb33d66ecebb541979 100644 --- a/results/fundamental_nlp_tasks/zero_shot/cola.csv +++ b/results/fundamental_nlp_tasks/zero_shot/cola.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7861936720997124 +Meta-Llama-3.1-8B-Instruct,0.7046979865771812 +Qwen2-72B-Instruct,0.8360498561840843 Meta-Llama-3-8B-Instruct,0.6481303930968361 Meta-Llama-3-70B-Instruct,0.835091083413231 diff --git a/results/fundamental_nlp_tasks/zero_shot/mnli.csv b/results/fundamental_nlp_tasks/zero_shot/mnli.csv index 55c4a1d8153d5674ee360384e5a623ff02b8ad5e..267c2466a5c6ae0a60d97f163402edc7999e7693 100644 --- a/results/fundamental_nlp_tasks/zero_shot/mnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/mnli.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7341578867002596 +Meta-Llama-3.1-8B-Instruct,0.4603756298671553 +Qwen2-72B-Instruct,0.7979335267470861 Meta-Llama-3-8B-Instruct,0.5296991907161399 Meta-Llama-3-70B-Instruct,0.6709421285692472 diff --git a/results/fundamental_nlp_tasks/zero_shot/mrpc.csv b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv index bf561efd8a4b1d91de2f4b4fce3bea9f3bbc422f..5cd6e28a6fa2d0eb45c398602763c464358ce254 100644 --- a/results/fundamental_nlp_tasks/zero_shot/mrpc.csv +++ b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7745098039215687 +Meta-Llama-3.1-8B-Instruct,0.6740196078431373 +Qwen2-72B-Instruct,0.7941176470588235 Meta-Llama-3-8B-Instruct,0.6764705882352942 Meta-Llama-3-70B-Instruct,0.7598039215686274 diff --git a/results/fundamental_nlp_tasks/zero_shot/ocnli.csv b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv index faa8c4ea976d028167f54f1c6f80f6733c9526a9..b609d3c48993738b792cd0f75ca7513d8879a9a1 100644 --- a/results/fundamental_nlp_tasks/zero_shot/ocnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.6474576271186441 +Meta-Llama-3.1-8B-Instruct,0.42135593220338985 +Qwen2-72B-Instruct,0.7874576271186441 Meta-Llama-3-8B-Instruct,0.4322033898305085 Meta-Llama-3-70B-Instruct,0.5928813559322034 diff --git a/results/fundamental_nlp_tasks/zero_shot/qnli.csv b/results/fundamental_nlp_tasks/zero_shot/qnli.csv index b7dfb5988db2ba12269801a8790fea6432973961..3e04ad5519eb082b9c768d5af448770e08205b78 100644 --- a/results/fundamental_nlp_tasks/zero_shot/qnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/qnli.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.8169503935566539 +Meta-Llama-3.1-8B-Instruct,0.6027823540179389 +Qwen2-72B-Instruct,0.8894380377082189 Meta-Llama-3-8B-Instruct,0.5689181768259198 Meta-Llama-3-70B-Instruct,0.876807614863628 diff --git a/results/fundamental_nlp_tasks/zero_shot/qqp.csv b/results/fundamental_nlp_tasks/zero_shot/qqp.csv index bbf0ac307ff93e929802c21e18e1ae13a08c2f36..38dc755aafa6bc5eb1af7387788e2d64d9c891b3 100644 --- a/results/fundamental_nlp_tasks/zero_shot/qqp.csv +++ b/results/fundamental_nlp_tasks/zero_shot/qqp.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7771209497897601 +Meta-Llama-3.1-8B-Instruct,0.5058125154588177 +Qwen2-72B-Instruct,0.7992332426416028 Meta-Llama-3-8B-Instruct,0.5512490724709375 Meta-Llama-3-70B-Instruct,0.7876082117239673 diff --git a/results/fundamental_nlp_tasks/zero_shot/rte.csv b/results/fundamental_nlp_tasks/zero_shot/rte.csv index b247ab7191c17cca09857286e2712222ee17e653..35bf1a4fcb540e661c5616815076be8baef7e50f 100644 --- a/results/fundamental_nlp_tasks/zero_shot/rte.csv +++ b/results/fundamental_nlp_tasks/zero_shot/rte.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.8411552346570397 +Meta-Llama-3.1-8B-Instruct,0.6895306859205776 +Qwen2-72B-Instruct,0.8592057761732852 Meta-Llama-3-8B-Instruct,0.6028880866425993 Meta-Llama-3-70B-Instruct,0.8086642599277978 diff --git a/results/fundamental_nlp_tasks/zero_shot/wnli.csv b/results/fundamental_nlp_tasks/zero_shot/wnli.csv index 879b9f67d19fd37dc515629a8e7e47d8d750742d..9413f31ff5d1c77105c91e5f78cc27effc4333f2 100644 --- a/results/fundamental_nlp_tasks/zero_shot/wnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/wnli.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.647887323943662 +Meta-Llama-3.1-8B-Instruct,0.4507042253521127 +Qwen2-72B-Instruct,0.9014084507042254 Meta-Llama-3-8B-Instruct,0.4507042253521127 Meta-Llama-3-70B-Instruct,0.7887323943661971 diff --git a/results/general_reasoning/few_shot/c_eval.csv b/results/general_reasoning/few_shot/c_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..e6be32891eca36d9b494be10a917d645910fb2ef 100644 --- a/results/general_reasoning/few_shot/c_eval.csv +++ b/results/general_reasoning/few_shot/c_eval.csv @@ -1 +1,4 @@ Model,Accuracy +Meta-Llama-3-70B,0.6257783312577833 +Meta-Llama-3-8B,0.43773349937733497 +Meta-Llama-3.1-8B,0.44458281444582815 diff --git a/results/general_reasoning/few_shot/cmmlu.csv b/results/general_reasoning/few_shot/cmmlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..578071253c1cba9da5da2c6779238a5424b4d4fd 100644 --- a/results/general_reasoning/few_shot/cmmlu.csv +++ b/results/general_reasoning/few_shot/cmmlu.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.4308409601105163 +Meta-Llama-3.1-8B,0.4555344500086341 diff --git a/results/general_reasoning/few_shot/indommlu.csv b/results/general_reasoning/few_shot/indommlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..d881ebd24fba166232400ede7d7a6059fff7872a 100644 --- a/results/general_reasoning/few_shot/indommlu.csv +++ b/results/general_reasoning/few_shot/indommlu.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.4500300420588824 +Meta-Llama-3.1-8B,0.4643834701916016 diff --git a/results/general_reasoning/few_shot/mmlu.csv b/results/general_reasoning/few_shot/mmlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..aef6cc8c6f1e0a61be2c06a2bd9496648c19289f 100644 --- a/results/general_reasoning/few_shot/mmlu.csv +++ b/results/general_reasoning/few_shot/mmlu.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.5651054701465856 +Meta-Llama-3.1-8B,0.5749731855559528 diff --git a/results/general_reasoning/few_shot/zbench.csv b/results/general_reasoning/few_shot/zbench.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..cc1232ff5dc5032cb557076d956c2416b1ee6a46 100644 --- a/results/general_reasoning/few_shot/zbench.csv +++ b/results/general_reasoning/few_shot/zbench.csv @@ -1 +1,3 @@ Model,Accuracy +Meta-Llama-3-8B,0.2727272727272727 +Meta-Llama-3.1-8B,0.3939393939393939 diff --git a/results/general_reasoning/zero_shot/c_eval.csv b/results/general_reasoning/zero_shot/c_eval.csv index d08930fb8dbd3349041f76ccde23044c8304b3c6..e25f74024a5ec9c9d9548b9741d05d1237ea7654 100644 --- a/results/general_reasoning/zero_shot/c_eval.csv +++ b/results/general_reasoning/zero_shot/c_eval.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7546699875466999 +Meta-Llama-3.1-8B-Instruct,0.3493150684931507 +Qwen2-72B-Instruct,0.823785803237858 Meta-Llama-3-8B-Instruct,0.4533001245330012 Meta-Llama-3-70B-Instruct,0.6046077210460772 diff --git a/results/general_reasoning/zero_shot/cmmlu.csv b/results/general_reasoning/zero_shot/cmmlu.csv index afbe25d4d83259f4b2b002e37673ed21711abeeb..9bb4f75790b633e5607c6e6d1caff7f215c0b7b6 100644 --- a/results/general_reasoning/zero_shot/cmmlu.csv +++ b/results/general_reasoning/zero_shot/cmmlu.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.7656708685891901 +Meta-Llama-3.1-8B-Instruct,0.38240372992574684 +Qwen2-72B-Instruct,0.8240372992574685 Meta-Llama-3-8B-Instruct,0.4679675358314626 Meta-Llama-3-70B-Instruct,0.6195821101709549 diff --git a/results/general_reasoning/zero_shot/indommlu.csv b/results/general_reasoning/zero_shot/indommlu.csv index 2b9d296121c9db1fd5c9d64a045dd639e6a471de..85859de27afdff51069baf79059175b6baaec7b1 100644 --- a/results/general_reasoning/zero_shot/indommlu.csv +++ b/results/general_reasoning/zero_shot/indommlu.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.53027571934041 +Meta-Llama-3.1-8B-Instruct,0.4701916015755391 +Qwen2-72B-Instruct,0.6356232058214835 Meta-Llama-3-8B-Instruct,0.5115161225716003 Meta-Llama-3-70B-Instruct,0.6323519594098405 diff --git a/results/general_reasoning/zero_shot/mmlu.csv b/results/general_reasoning/zero_shot/mmlu.csv index c9624f74c0176cad41871491016275e5ba2035e4..4bab95a37e9d017e663f4c3134fd168020d04927 100644 --- a/results/general_reasoning/zero_shot/mmlu.csv +++ b/results/general_reasoning/zero_shot/mmlu.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.6654272434751519 +Meta-Llama-3.1-8B-Instruct,0.5518770110833036 +Qwen2-72B-Instruct,0.7935645334286736 Meta-Llama-3-8B-Instruct,0.508044333214158 Meta-Llama-3-70B-Instruct,0.7607436539149088 diff --git a/results/general_reasoning/zero_shot/zbench.csv b/results/general_reasoning/zero_shot/zbench.csv index 1e0718ab1a40555aea72e40648613d13d9ce1360..ab679616e3cb0395ca8be3a917e38a19739176cb 100644 --- a/results/general_reasoning/zero_shot/zbench.csv +++ b/results/general_reasoning/zero_shot/zbench.csv @@ -1,4 +1,6 @@ Model,Accuracy Qwen2-7B-Instruct,0.696969696969697 +Meta-Llama-3.1-8B-Instruct,0.45454545454545453 +Qwen2-72B-Instruct,0.5757575757575758 Meta-Llama-3-8B-Instruct,0.30303030303030304 Meta-Llama-3-70B-Instruct,0.45454545454545453