Spaces:
Running
Running
new
Browse files
app.py
CHANGED
@@ -9,10 +9,8 @@ from statistics import median
|
|
9 |
|
10 |
print("Loading datasets...")
|
11 |
|
12 |
-
|
13 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
14 |
|
15 |
-
|
16 |
def add_rank(df, compute_average=True):
|
17 |
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
|
18 |
if len(cols_to_rank) == 1:
|
@@ -78,7 +76,6 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
|
|
78 |
AC3_3 = median(AC3_3)
|
79 |
|
80 |
except:
|
81 |
-
print(results_list)
|
82 |
consistency_score_3 = -1
|
83 |
overall_acc = -1
|
84 |
AC3_3 = -1
|
@@ -146,7 +143,6 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
|
|
146 |
|
147 |
|
148 |
except:
|
149 |
-
print(results_list)
|
150 |
English = -1
|
151 |
Vietnamese = -1
|
152 |
Chinese = -1
|
@@ -219,7 +215,6 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
|
|
219 |
AC3_3 = median(AC3_3)
|
220 |
|
221 |
except:
|
222 |
-
print(results_list)
|
223 |
consistency_score_3 = -1
|
224 |
overall_acc = -1
|
225 |
AC3_3 = -1
|
@@ -287,7 +282,6 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
|
|
287 |
|
288 |
|
289 |
except:
|
290 |
-
print(results_list)
|
291 |
English = -1
|
292 |
Vietnamese = -1
|
293 |
Chinese = -1
|
@@ -351,7 +345,6 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
351 |
accuracy = median([results['accuracy'] for results in results_list])
|
352 |
|
353 |
except:
|
354 |
-
print(results_list)
|
355 |
accuracy = -1
|
356 |
|
357 |
res = {
|
@@ -404,7 +397,6 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
404 |
accuracy = median([results['accuracy'] for results in results_list])
|
405 |
|
406 |
except:
|
407 |
-
print(results_list)
|
408 |
accuracy = -1
|
409 |
|
410 |
|
@@ -458,7 +450,6 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
458 |
accuracy = median([results['accuracy'] for results in results_list])
|
459 |
|
460 |
except:
|
461 |
-
print(results_list)
|
462 |
accuracy = -1
|
463 |
|
464 |
|
@@ -512,7 +503,6 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
512 |
accuracy = median([results['accuracy'] for results in results_list])
|
513 |
|
514 |
except:
|
515 |
-
print(results_list)
|
516 |
accuracy = -1
|
517 |
|
518 |
|
@@ -566,7 +556,6 @@ def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
566 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
567 |
|
568 |
except:
|
569 |
-
print(results_list)
|
570 |
bleu_score = -1
|
571 |
|
572 |
|
@@ -619,7 +608,6 @@ def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
619 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
620 |
|
621 |
except:
|
622 |
-
print(results_list)
|
623 |
bleu_score = -1
|
624 |
|
625 |
|
@@ -674,7 +662,6 @@ def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
674 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
675 |
|
676 |
except:
|
677 |
-
print(results_list)
|
678 |
bleu_score = -1
|
679 |
|
680 |
|
@@ -727,7 +714,6 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
727 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
728 |
|
729 |
except:
|
730 |
-
print(results_list)
|
731 |
bleu_score = -1
|
732 |
|
733 |
|
@@ -781,7 +767,6 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
|
|
781 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
782 |
|
783 |
except:
|
784 |
-
print(results_list)
|
785 |
bleu_score = -1
|
786 |
|
787 |
|
@@ -835,7 +820,6 @@ def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
|
|
835 |
accuracy = median([results['accuracy'] for results in results_list])
|
836 |
|
837 |
except:
|
838 |
-
print(results_list)
|
839 |
accuracy = -1
|
840 |
|
841 |
|
@@ -890,7 +874,6 @@ def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
890 |
accuracy = median([results['accuracy'] for results in results_list])
|
891 |
|
892 |
except:
|
893 |
-
print(results_list)
|
894 |
accuracy = -1
|
895 |
|
896 |
|
@@ -944,7 +927,6 @@ def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
|
|
944 |
accuracy = median([results['accuracy'] for results in results_list])
|
945 |
|
946 |
except:
|
947 |
-
print(results_list)
|
948 |
accuracy = -1
|
949 |
|
950 |
|
@@ -998,7 +980,6 @@ def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
998 |
accuracy = median([results['accuracy'] for results in results_list])
|
999 |
|
1000 |
except:
|
1001 |
-
print(results_list)
|
1002 |
accuracy = -1
|
1003 |
|
1004 |
|
@@ -1054,7 +1035,6 @@ def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1054 |
accuracy = median([results['accuracy'] for results in results_list])
|
1055 |
|
1056 |
except:
|
1057 |
-
print(results_list)
|
1058 |
accuracy = -1
|
1059 |
|
1060 |
|
@@ -1112,7 +1092,6 @@ def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1112 |
accuracy = median([results['accuracy'] for results in results_list])
|
1113 |
|
1114 |
except:
|
1115 |
-
print(results_list)
|
1116 |
accuracy = -1
|
1117 |
|
1118 |
|
@@ -1167,7 +1146,6 @@ def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1167 |
accuracy = median([results['accuracy'] for results in results_list])
|
1168 |
|
1169 |
except:
|
1170 |
-
print(results_list)
|
1171 |
accuracy = -1
|
1172 |
|
1173 |
|
@@ -1222,7 +1200,6 @@ def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1222 |
accuracy = median([results['accuracy'] for results in results_list])
|
1223 |
|
1224 |
except:
|
1225 |
-
print(results_list)
|
1226 |
accuracy = -1
|
1227 |
|
1228 |
|
@@ -1278,7 +1255,6 @@ def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1278 |
accuracy = median([results['accuracy'] for results in results_list])
|
1279 |
|
1280 |
except:
|
1281 |
-
print(results_list)
|
1282 |
accuracy = -1
|
1283 |
|
1284 |
|
@@ -1333,7 +1309,6 @@ def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1333 |
accuracy = median([results['accuracy'] for results in results_list])
|
1334 |
|
1335 |
except:
|
1336 |
-
print(results_list)
|
1337 |
accuracy = -1
|
1338 |
|
1339 |
|
@@ -1388,7 +1363,6 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1388 |
accuracy = median([results['accuracy'] for results in results_list])
|
1389 |
|
1390 |
except:
|
1391 |
-
print(results_list)
|
1392 |
accuracy = -1
|
1393 |
|
1394 |
|
@@ -1445,7 +1419,6 @@ def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1445 |
rougeL = median([results['rougeL'] for results in results_list])
|
1446 |
|
1447 |
except:
|
1448 |
-
print(results_list)
|
1449 |
rouge1 = -1
|
1450 |
rouge2 = -1
|
1451 |
rougeL = -1
|
@@ -1505,7 +1478,6 @@ def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1505 |
rougeL = median([results['rougeL'] for results in results_list])
|
1506 |
|
1507 |
except:
|
1508 |
-
print(results_list)
|
1509 |
rouge1 = -1
|
1510 |
rouge2 = -1
|
1511 |
rougeL = -1
|
@@ -1565,7 +1537,6 @@ def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1565 |
accuracy = median([results['accuracy'] for results in results_list])
|
1566 |
|
1567 |
except:
|
1568 |
-
print(results_list)
|
1569 |
accuracy = -1
|
1570 |
|
1571 |
|
@@ -1621,7 +1592,6 @@ def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1621 |
accuracy = median([results['accuracy'] for results in results_list])
|
1622 |
|
1623 |
except:
|
1624 |
-
print(results_list)
|
1625 |
accuracy = -1
|
1626 |
|
1627 |
|
@@ -1678,7 +1648,6 @@ def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1678 |
accuracy = median([results['accuracy'] for results in results_list])
|
1679 |
|
1680 |
except:
|
1681 |
-
print(results_list)
|
1682 |
accuracy = -1
|
1683 |
|
1684 |
|
@@ -1735,7 +1704,6 @@ def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1735 |
accuracy = median([results['accuracy'] for results in results_list])
|
1736 |
|
1737 |
except:
|
1738 |
-
print(results_list)
|
1739 |
accuracy = -1
|
1740 |
|
1741 |
|
@@ -1792,7 +1760,6 @@ def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1792 |
accuracy = median([results['accuracy'] for results in results_list])
|
1793 |
|
1794 |
except:
|
1795 |
-
print(results_list)
|
1796 |
accuracy = -1
|
1797 |
|
1798 |
|
@@ -1849,7 +1816,6 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1849 |
accuracy = median([results['accuracy'] for results in results_list])
|
1850 |
|
1851 |
except:
|
1852 |
-
print(results_list)
|
1853 |
accuracy = -1
|
1854 |
|
1855 |
|
@@ -1906,7 +1872,6 @@ def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1906 |
accuracy = median([results['accuracy'] for results in results_list])
|
1907 |
|
1908 |
except:
|
1909 |
-
print(results_list)
|
1910 |
accuracy = -1
|
1911 |
|
1912 |
|
@@ -1964,7 +1929,6 @@ def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
|
|
1964 |
accuracy = median([results['accuracy'] for results in results_list])
|
1965 |
|
1966 |
except:
|
1967 |
-
print(results_list)
|
1968 |
accuracy = -1
|
1969 |
|
1970 |
|
@@ -3052,15 +3016,6 @@ with block:
|
|
3052 |
|
3053 |
|
3054 |
|
3055 |
-
|
3056 |
-
|
3057 |
-
|
3058 |
-
|
3059 |
-
|
3060 |
-
|
3061 |
-
|
3062 |
-
|
3063 |
-
|
3064 |
gr.Markdown(r"""
|
3065 |
|
3066 |
If this work is useful to you, please cite our work:
|
|
|
9 |
|
10 |
print("Loading datasets...")
|
11 |
|
|
|
12 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
13 |
|
|
|
14 |
def add_rank(df, compute_average=True):
|
15 |
cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]]
|
16 |
if len(cols_to_rank) == 1:
|
|
|
76 |
AC3_3 = median(AC3_3)
|
77 |
|
78 |
except:
|
|
|
79 |
consistency_score_3 = -1
|
80 |
overall_acc = -1
|
81 |
AC3_3 = -1
|
|
|
143 |
|
144 |
|
145 |
except:
|
|
|
146 |
English = -1
|
147 |
Vietnamese = -1
|
148 |
Chinese = -1
|
|
|
215 |
AC3_3 = median(AC3_3)
|
216 |
|
217 |
except:
|
|
|
218 |
consistency_score_3 = -1
|
219 |
overall_acc = -1
|
220 |
AC3_3 = -1
|
|
|
282 |
|
283 |
|
284 |
except:
|
|
|
285 |
English = -1
|
286 |
Vietnamese = -1
|
287 |
Chinese = -1
|
|
|
345 |
accuracy = median([results['accuracy'] for results in results_list])
|
346 |
|
347 |
except:
|
|
|
348 |
accuracy = -1
|
349 |
|
350 |
res = {
|
|
|
397 |
accuracy = median([results['accuracy'] for results in results_list])
|
398 |
|
399 |
except:
|
|
|
400 |
accuracy = -1
|
401 |
|
402 |
|
|
|
450 |
accuracy = median([results['accuracy'] for results in results_list])
|
451 |
|
452 |
except:
|
|
|
453 |
accuracy = -1
|
454 |
|
455 |
|
|
|
503 |
accuracy = median([results['accuracy'] for results in results_list])
|
504 |
|
505 |
except:
|
|
|
506 |
accuracy = -1
|
507 |
|
508 |
|
|
|
556 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
557 |
|
558 |
except:
|
|
|
559 |
bleu_score = -1
|
560 |
|
561 |
|
|
|
608 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
609 |
|
610 |
except:
|
|
|
611 |
bleu_score = -1
|
612 |
|
613 |
|
|
|
662 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
663 |
|
664 |
except:
|
|
|
665 |
bleu_score = -1
|
666 |
|
667 |
|
|
|
714 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
715 |
|
716 |
except:
|
|
|
717 |
bleu_score = -1
|
718 |
|
719 |
|
|
|
767 |
bleu_score = median([results['bleu_score'] for results in results_list])
|
768 |
|
769 |
except:
|
|
|
770 |
bleu_score = -1
|
771 |
|
772 |
|
|
|
820 |
accuracy = median([results['accuracy'] for results in results_list])
|
821 |
|
822 |
except:
|
|
|
823 |
accuracy = -1
|
824 |
|
825 |
|
|
|
874 |
accuracy = median([results['accuracy'] for results in results_list])
|
875 |
|
876 |
except:
|
|
|
877 |
accuracy = -1
|
878 |
|
879 |
|
|
|
927 |
accuracy = median([results['accuracy'] for results in results_list])
|
928 |
|
929 |
except:
|
|
|
930 |
accuracy = -1
|
931 |
|
932 |
|
|
|
980 |
accuracy = median([results['accuracy'] for results in results_list])
|
981 |
|
982 |
except:
|
|
|
983 |
accuracy = -1
|
984 |
|
985 |
|
|
|
1035 |
accuracy = median([results['accuracy'] for results in results_list])
|
1036 |
|
1037 |
except:
|
|
|
1038 |
accuracy = -1
|
1039 |
|
1040 |
|
|
|
1092 |
accuracy = median([results['accuracy'] for results in results_list])
|
1093 |
|
1094 |
except:
|
|
|
1095 |
accuracy = -1
|
1096 |
|
1097 |
|
|
|
1146 |
accuracy = median([results['accuracy'] for results in results_list])
|
1147 |
|
1148 |
except:
|
|
|
1149 |
accuracy = -1
|
1150 |
|
1151 |
|
|
|
1200 |
accuracy = median([results['accuracy'] for results in results_list])
|
1201 |
|
1202 |
except:
|
|
|
1203 |
accuracy = -1
|
1204 |
|
1205 |
|
|
|
1255 |
accuracy = median([results['accuracy'] for results in results_list])
|
1256 |
|
1257 |
except:
|
|
|
1258 |
accuracy = -1
|
1259 |
|
1260 |
|
|
|
1309 |
accuracy = median([results['accuracy'] for results in results_list])
|
1310 |
|
1311 |
except:
|
|
|
1312 |
accuracy = -1
|
1313 |
|
1314 |
|
|
|
1363 |
accuracy = median([results['accuracy'] for results in results_list])
|
1364 |
|
1365 |
except:
|
|
|
1366 |
accuracy = -1
|
1367 |
|
1368 |
|
|
|
1419 |
rougeL = median([results['rougeL'] for results in results_list])
|
1420 |
|
1421 |
except:
|
|
|
1422 |
rouge1 = -1
|
1423 |
rouge2 = -1
|
1424 |
rougeL = -1
|
|
|
1478 |
rougeL = median([results['rougeL'] for results in results_list])
|
1479 |
|
1480 |
except:
|
|
|
1481 |
rouge1 = -1
|
1482 |
rouge2 = -1
|
1483 |
rougeL = -1
|
|
|
1537 |
accuracy = median([results['accuracy'] for results in results_list])
|
1538 |
|
1539 |
except:
|
|
|
1540 |
accuracy = -1
|
1541 |
|
1542 |
|
|
|
1592 |
accuracy = median([results['accuracy'] for results in results_list])
|
1593 |
|
1594 |
except:
|
|
|
1595 |
accuracy = -1
|
1596 |
|
1597 |
|
|
|
1648 |
accuracy = median([results['accuracy'] for results in results_list])
|
1649 |
|
1650 |
except:
|
|
|
1651 |
accuracy = -1
|
1652 |
|
1653 |
|
|
|
1704 |
accuracy = median([results['accuracy'] for results in results_list])
|
1705 |
|
1706 |
except:
|
|
|
1707 |
accuracy = -1
|
1708 |
|
1709 |
|
|
|
1760 |
accuracy = median([results['accuracy'] for results in results_list])
|
1761 |
|
1762 |
except:
|
|
|
1763 |
accuracy = -1
|
1764 |
|
1765 |
|
|
|
1816 |
accuracy = median([results['accuracy'] for results in results_list])
|
1817 |
|
1818 |
except:
|
|
|
1819 |
accuracy = -1
|
1820 |
|
1821 |
|
|
|
1872 |
accuracy = median([results['accuracy'] for results in results_list])
|
1873 |
|
1874 |
except:
|
|
|
1875 |
accuracy = -1
|
1876 |
|
1877 |
|
|
|
1929 |
accuracy = median([results['accuracy'] for results in results_list])
|
1930 |
|
1931 |
except:
|
|
|
1932 |
accuracy = -1
|
1933 |
|
1934 |
|
|
|
3016 |
|
3017 |
|
3018 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3019 |
gr.Markdown(r"""
|
3020 |
|
3021 |
If this work is useful to you, please cite our work:
|