diff --git "a/distilbert/distilbert-base-cased/checkpoint-738/trainer_state.json" "b/distilbert/distilbert-base-cased/checkpoint-738/trainer_state.json" deleted file mode 100644--- "a/distilbert/distilbert-base-cased/checkpoint-738/trainer_state.json" +++ /dev/null @@ -1,5251 +0,0 @@ -{ - "best_global_step": 738, - "best_metric": 0.28349459171295166, - "best_model_checkpoint": "./my_model/checkpoint-738", - "epoch": 3.0, - "eval_steps": 500, - "global_step": 738, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0040650406504065045, - "grad_norm": 6.3279547691345215, - "learning_rate": 0.0, - "loss": 1.2435, - "step": 1 - }, - { - "epoch": 0.008130081300813009, - "grad_norm": 6.448081016540527, - "learning_rate": 2.5e-06, - "loss": 1.2287, - "step": 2 - }, - { - "epoch": 0.012195121951219513, - "grad_norm": 5.895285606384277, - "learning_rate": 5e-06, - "loss": 1.2332, - "step": 3 - }, - { - "epoch": 0.016260162601626018, - "grad_norm": 5.525364398956299, - "learning_rate": 7.5e-06, - "loss": 1.2305, - "step": 4 - }, - { - "epoch": 0.02032520325203252, - "grad_norm": 5.8905134201049805, - "learning_rate": 1e-05, - "loss": 1.1847, - "step": 5 - }, - { - "epoch": 0.024390243902439025, - "grad_norm": 5.951143741607666, - "learning_rate": 1.25e-05, - "loss": 1.1543, - "step": 6 - }, - { - "epoch": 0.028455284552845527, - "grad_norm": 6.171472549438477, - "learning_rate": 1.5e-05, - "loss": 1.0995, - "step": 7 - }, - { - "epoch": 0.032520325203252036, - "grad_norm": 5.739941120147705, - "learning_rate": 1.75e-05, - "loss": 1.0581, - "step": 8 - }, - { - "epoch": 0.036585365853658534, - "grad_norm": 5.684756278991699, - "learning_rate": 2e-05, - "loss": 1.0026, - "step": 9 - }, - { - "epoch": 0.04065040650406504, - "grad_norm": 5.360750675201416, - "learning_rate": 2.25e-05, - "loss": 0.958, - "step": 10 - }, - { - "epoch": 0.044715447154471545, - "grad_norm": 5.01375150680542, - "learning_rate": 2.5e-05, - "loss": 0.8785, - "step": 11 - }, - { - "epoch": 0.04878048780487805, - "grad_norm": 4.2232208251953125, - "learning_rate": 2.7500000000000004e-05, - "loss": 0.8611, - "step": 12 - }, - { - "epoch": 0.052845528455284556, - "grad_norm": 4.456465244293213, - "learning_rate": 3e-05, - "loss": 0.7041, - "step": 13 - }, - { - "epoch": 0.056910569105691054, - "grad_norm": 3.749004602432251, - "learning_rate": 3.2500000000000004e-05, - "loss": 0.6202, - "step": 14 - }, - { - "epoch": 0.06097560975609756, - "grad_norm": 2.679121971130371, - "learning_rate": 3.5e-05, - "loss": 0.5749, - "step": 15 - }, - { - "epoch": 0.06504065040650407, - "grad_norm": 3.1547088623046875, - "learning_rate": 3.7500000000000003e-05, - "loss": 0.4849, - "step": 16 - }, - { - "epoch": 0.06910569105691057, - "grad_norm": 2.5799901485443115, - "learning_rate": 4e-05, - "loss": 0.4483, - "step": 17 - }, - { - "epoch": 0.07317073170731707, - "grad_norm": 0.5943020582199097, - "learning_rate": 4.25e-05, - "loss": 0.5245, - "step": 18 - }, - { - "epoch": 0.07723577235772358, - "grad_norm": 1.0397089719772339, - "learning_rate": 4.5e-05, - "loss": 0.3429, - "step": 19 - }, - { - "epoch": 0.08130081300813008, - "grad_norm": 0.7015799283981323, - "learning_rate": 4.75e-05, - "loss": 0.4437, - "step": 20 - }, - { - "epoch": 0.08536585365853659, - "grad_norm": 0.8231913447380066, - "learning_rate": 5e-05, - "loss": 0.503, - "step": 21 - }, - { - "epoch": 0.08943089430894309, - "grad_norm": 0.45596688985824585, - "learning_rate": 4.999991573663978e-05, - "loss": 0.3543, - "step": 22 - }, - { - "epoch": 0.09349593495934959, - "grad_norm": 1.0425209999084473, - "learning_rate": 4.999966294712711e-05, - "loss": 0.5317, - "step": 23 - }, - { - "epoch": 0.0975609756097561, - "grad_norm": 0.4913753271102905, - "learning_rate": 4.999924163316609e-05, - "loss": 0.4155, - "step": 24 - }, - { - "epoch": 0.1016260162601626, - "grad_norm": 0.5138763189315796, - "learning_rate": 4.999865179759683e-05, - "loss": 0.5109, - "step": 25 - }, - { - "epoch": 0.10569105691056911, - "grad_norm": 1.0378741025924683, - "learning_rate": 4.999789344439543e-05, - "loss": 0.565, - "step": 26 - }, - { - "epoch": 0.10975609756097561, - "grad_norm": 1.4908039569854736, - "learning_rate": 4.999696657867401e-05, - "loss": 0.2993, - "step": 27 - }, - { - "epoch": 0.11382113821138211, - "grad_norm": 0.6371445655822754, - "learning_rate": 4.999587120668063e-05, - "loss": 0.4997, - "step": 28 - }, - { - "epoch": 0.11788617886178862, - "grad_norm": 0.6722875833511353, - "learning_rate": 4.9994607335799276e-05, - "loss": 0.4253, - "step": 29 - }, - { - "epoch": 0.12195121951219512, - "grad_norm": 0.7855675220489502, - "learning_rate": 4.999317497454979e-05, - "loss": 0.386, - "step": 30 - }, - { - "epoch": 0.12601626016260162, - "grad_norm": 0.5912953019142151, - "learning_rate": 4.9991574132587815e-05, - "loss": 0.4176, - "step": 31 - }, - { - "epoch": 0.13008130081300814, - "grad_norm": 0.7740069627761841, - "learning_rate": 4.9989804820704735e-05, - "loss": 0.5273, - "step": 32 - }, - { - "epoch": 0.13414634146341464, - "grad_norm": 0.3601555824279785, - "learning_rate": 4.99878670508276e-05, - "loss": 0.4005, - "step": 33 - }, - { - "epoch": 0.13821138211382114, - "grad_norm": 0.7042465209960938, - "learning_rate": 4.9985760836019055e-05, - "loss": 0.4822, - "step": 34 - }, - { - "epoch": 0.14227642276422764, - "grad_norm": 0.41130900382995605, - "learning_rate": 4.998348619047725e-05, - "loss": 0.4024, - "step": 35 - }, - { - "epoch": 0.14634146341463414, - "grad_norm": 0.6644321084022522, - "learning_rate": 4.9981043129535704e-05, - "loss": 0.5214, - "step": 36 - }, - { - "epoch": 0.15040650406504066, - "grad_norm": 0.45171546936035156, - "learning_rate": 4.997843166966327e-05, - "loss": 0.484, - "step": 37 - }, - { - "epoch": 0.15447154471544716, - "grad_norm": 0.7962363958358765, - "learning_rate": 4.997565182846399e-05, - "loss": 0.4892, - "step": 38 - }, - { - "epoch": 0.15853658536585366, - "grad_norm": 0.5599137544631958, - "learning_rate": 4.9972703624676944e-05, - "loss": 0.4963, - "step": 39 - }, - { - "epoch": 0.16260162601626016, - "grad_norm": 0.9151962399482727, - "learning_rate": 4.996958707817619e-05, - "loss": 0.3951, - "step": 40 - }, - { - "epoch": 0.16666666666666666, - "grad_norm": 0.588839590549469, - "learning_rate": 4.996630220997058e-05, - "loss": 0.4232, - "step": 41 - }, - { - "epoch": 0.17073170731707318, - "grad_norm": 0.7585485577583313, - "learning_rate": 4.996284904220363e-05, - "loss": 0.3121, - "step": 42 - }, - { - "epoch": 0.17479674796747968, - "grad_norm": 0.6970006227493286, - "learning_rate": 4.995922759815339e-05, - "loss": 0.4766, - "step": 43 - }, - { - "epoch": 0.17886178861788618, - "grad_norm": 0.5253265500068665, - "learning_rate": 4.995543790223227e-05, - "loss": 0.3858, - "step": 44 - }, - { - "epoch": 0.18292682926829268, - "grad_norm": 0.836520254611969, - "learning_rate": 4.995147997998685e-05, - "loss": 0.4203, - "step": 45 - }, - { - "epoch": 0.18699186991869918, - "grad_norm": 0.6241826415061951, - "learning_rate": 4.994735385809777e-05, - "loss": 0.382, - "step": 46 - }, - { - "epoch": 0.1910569105691057, - "grad_norm": 0.45944222807884216, - "learning_rate": 4.9943059564379504e-05, - "loss": 0.3258, - "step": 47 - }, - { - "epoch": 0.1951219512195122, - "grad_norm": 0.9010629653930664, - "learning_rate": 4.9938597127780173e-05, - "loss": 0.492, - "step": 48 - }, - { - "epoch": 0.1991869918699187, - "grad_norm": 0.7751436829566956, - "learning_rate": 4.993396657838138e-05, - "loss": 0.4161, - "step": 49 - }, - { - "epoch": 0.2032520325203252, - "grad_norm": 1.1746246814727783, - "learning_rate": 4.992916794739796e-05, - "loss": 0.3404, - "step": 50 - }, - { - "epoch": 0.2073170731707317, - "grad_norm": 0.6498550176620483, - "learning_rate": 4.992420126717784e-05, - "loss": 0.4203, - "step": 51 - }, - { - "epoch": 0.21138211382113822, - "grad_norm": 0.5611834526062012, - "learning_rate": 4.9919066571201725e-05, - "loss": 0.3227, - "step": 52 - }, - { - "epoch": 0.21544715447154472, - "grad_norm": 0.5788379311561584, - "learning_rate": 4.991376389408297e-05, - "loss": 0.3091, - "step": 53 - }, - { - "epoch": 0.21951219512195122, - "grad_norm": 1.3961076736450195, - "learning_rate": 4.9908293271567286e-05, - "loss": 0.388, - "step": 54 - }, - { - "epoch": 0.22357723577235772, - "grad_norm": 0.7671408653259277, - "learning_rate": 4.990265474053252e-05, - "loss": 0.3864, - "step": 55 - }, - { - "epoch": 0.22764227642276422, - "grad_norm": 0.9589133858680725, - "learning_rate": 4.989684833898838e-05, - "loss": 0.3643, - "step": 56 - }, - { - "epoch": 0.23170731707317074, - "grad_norm": 0.8210120797157288, - "learning_rate": 4.9890874106076236e-05, - "loss": 0.3432, - "step": 57 - }, - { - "epoch": 0.23577235772357724, - "grad_norm": 1.0713058710098267, - "learning_rate": 4.988473208206879e-05, - "loss": 0.4467, - "step": 58 - }, - { - "epoch": 0.23983739837398374, - "grad_norm": 1.2754943370819092, - "learning_rate": 4.987842230836986e-05, - "loss": 0.4288, - "step": 59 - }, - { - "epoch": 0.24390243902439024, - "grad_norm": 0.9665277004241943, - "learning_rate": 4.987194482751406e-05, - "loss": 0.4407, - "step": 60 - }, - { - "epoch": 0.24796747967479674, - "grad_norm": 1.2185570001602173, - "learning_rate": 4.986529968316653e-05, - "loss": 0.4629, - "step": 61 - }, - { - "epoch": 0.25203252032520324, - "grad_norm": 1.9038009643554688, - "learning_rate": 4.985848692012266e-05, - "loss": 0.4773, - "step": 62 - }, - { - "epoch": 0.25609756097560976, - "grad_norm": 1.347111701965332, - "learning_rate": 4.985150658430774e-05, - "loss": 0.3596, - "step": 63 - }, - { - "epoch": 0.2601626016260163, - "grad_norm": 1.0209671258926392, - "learning_rate": 4.9844358722776695e-05, - "loss": 0.3613, - "step": 64 - }, - { - "epoch": 0.26422764227642276, - "grad_norm": 2.5065293312072754, - "learning_rate": 4.9837043383713753e-05, - "loss": 0.6948, - "step": 65 - }, - { - "epoch": 0.2682926829268293, - "grad_norm": 0.9995410442352295, - "learning_rate": 4.982956061643212e-05, - "loss": 0.4096, - "step": 66 - }, - { - "epoch": 0.27235772357723576, - "grad_norm": 1.4097882509231567, - "learning_rate": 4.982191047137366e-05, - "loss": 0.3713, - "step": 67 - }, - { - "epoch": 0.2764227642276423, - "grad_norm": 1.068121075630188, - "learning_rate": 4.98140930001085e-05, - "loss": 0.4966, - "step": 68 - }, - { - "epoch": 0.2804878048780488, - "grad_norm": 1.7965993881225586, - "learning_rate": 4.980610825533476e-05, - "loss": 0.3387, - "step": 69 - }, - { - "epoch": 0.2845528455284553, - "grad_norm": 1.1957075595855713, - "learning_rate": 4.979795629087817e-05, - "loss": 0.4179, - "step": 70 - }, - { - "epoch": 0.2886178861788618, - "grad_norm": 1.0213898420333862, - "learning_rate": 4.978963716169166e-05, - "loss": 0.4147, - "step": 71 - }, - { - "epoch": 0.2926829268292683, - "grad_norm": 0.9609593749046326, - "learning_rate": 4.978115092385507e-05, - "loss": 0.3212, - "step": 72 - }, - { - "epoch": 0.2967479674796748, - "grad_norm": 0.6978228092193604, - "learning_rate": 4.9772497634574706e-05, - "loss": 0.3167, - "step": 73 - }, - { - "epoch": 0.3008130081300813, - "grad_norm": 2.0379698276519775, - "learning_rate": 4.976367735218299e-05, - "loss": 0.5018, - "step": 74 - }, - { - "epoch": 0.3048780487804878, - "grad_norm": 0.8079779744148254, - "learning_rate": 4.975469013613804e-05, - "loss": 0.3043, - "step": 75 - }, - { - "epoch": 0.3089430894308943, - "grad_norm": 1.2687965631484985, - "learning_rate": 4.9745536047023324e-05, - "loss": 0.4565, - "step": 76 - }, - { - "epoch": 0.3130081300813008, - "grad_norm": 1.728155493736267, - "learning_rate": 4.9736215146547163e-05, - "loss": 0.2679, - "step": 77 - }, - { - "epoch": 0.3170731707317073, - "grad_norm": 1.3003721237182617, - "learning_rate": 4.972672749754239e-05, - "loss": 0.339, - "step": 78 - }, - { - "epoch": 0.32113821138211385, - "grad_norm": 1.483816146850586, - "learning_rate": 4.971707316396592e-05, - "loss": 0.3669, - "step": 79 - }, - { - "epoch": 0.3252032520325203, - "grad_norm": 1.7120908498764038, - "learning_rate": 4.970725221089826e-05, - "loss": 0.2839, - "step": 80 - }, - { - "epoch": 0.32926829268292684, - "grad_norm": 1.1175329685211182, - "learning_rate": 4.969726470454313e-05, - "loss": 0.2924, - "step": 81 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 2.614908456802368, - "learning_rate": 4.968711071222701e-05, - "loss": 0.5139, - "step": 82 - }, - { - "epoch": 0.33739837398373984, - "grad_norm": 1.713719129562378, - "learning_rate": 4.9676790302398654e-05, - "loss": 0.4138, - "step": 83 - }, - { - "epoch": 0.34146341463414637, - "grad_norm": 2.611751079559326, - "learning_rate": 4.966630354462866e-05, - "loss": 0.4368, - "step": 84 - }, - { - "epoch": 0.34552845528455284, - "grad_norm": 1.33542001247406, - "learning_rate": 4.965565050960897e-05, - "loss": 0.389, - "step": 85 - }, - { - "epoch": 0.34959349593495936, - "grad_norm": 1.5493632555007935, - "learning_rate": 4.964483126915245e-05, - "loss": 0.4972, - "step": 86 - }, - { - "epoch": 0.35365853658536583, - "grad_norm": 1.2695201635360718, - "learning_rate": 4.963384589619233e-05, - "loss": 0.3675, - "step": 87 - }, - { - "epoch": 0.35772357723577236, - "grad_norm": 1.9259226322174072, - "learning_rate": 4.962269446478176e-05, - "loss": 0.3198, - "step": 88 - }, - { - "epoch": 0.3617886178861789, - "grad_norm": 1.4687918424606323, - "learning_rate": 4.961137705009331e-05, - "loss": 0.2853, - "step": 89 - }, - { - "epoch": 0.36585365853658536, - "grad_norm": 0.9229425191879272, - "learning_rate": 4.959989372841846e-05, - "loss": 0.4061, - "step": 90 - }, - { - "epoch": 0.3699186991869919, - "grad_norm": 0.8202106356620789, - "learning_rate": 4.9588244577167065e-05, - "loss": 0.1883, - "step": 91 - }, - { - "epoch": 0.37398373983739835, - "grad_norm": 1.8579500913619995, - "learning_rate": 4.9576429674866854e-05, - "loss": 0.479, - "step": 92 - }, - { - "epoch": 0.3780487804878049, - "grad_norm": 1.8375589847564697, - "learning_rate": 4.956444910116289e-05, - "loss": 0.4011, - "step": 93 - }, - { - "epoch": 0.3821138211382114, - "grad_norm": 1.1253693103790283, - "learning_rate": 4.9552302936817066e-05, - "loss": 0.3982, - "step": 94 - }, - { - "epoch": 0.3861788617886179, - "grad_norm": 1.9736450910568237, - "learning_rate": 4.953999126370749e-05, - "loss": 0.4298, - "step": 95 - }, - { - "epoch": 0.3902439024390244, - "grad_norm": 1.7076289653778076, - "learning_rate": 4.952751416482801e-05, - "loss": 0.4045, - "step": 96 - }, - { - "epoch": 0.3943089430894309, - "grad_norm": 0.975965142250061, - "learning_rate": 4.9514871724287604e-05, - "loss": 0.3756, - "step": 97 - }, - { - "epoch": 0.3983739837398374, - "grad_norm": 1.2571923732757568, - "learning_rate": 4.9502064027309836e-05, - "loss": 0.4453, - "step": 98 - }, - { - "epoch": 0.4024390243902439, - "grad_norm": 1.007575511932373, - "learning_rate": 4.948909116023227e-05, - "loss": 0.3667, - "step": 99 - }, - { - "epoch": 0.4065040650406504, - "grad_norm": 1.2273565530776978, - "learning_rate": 4.94759532105059e-05, - "loss": 0.3432, - "step": 100 - }, - { - "epoch": 0.4105691056910569, - "grad_norm": 1.224801778793335, - "learning_rate": 4.9462650266694544e-05, - "loss": 0.3931, - "step": 101 - }, - { - "epoch": 0.4146341463414634, - "grad_norm": 1.5677101612091064, - "learning_rate": 4.944918241847426e-05, - "loss": 0.3417, - "step": 102 - }, - { - "epoch": 0.4186991869918699, - "grad_norm": 0.8385095596313477, - "learning_rate": 4.943554975663275e-05, - "loss": 0.3336, - "step": 103 - }, - { - "epoch": 0.42276422764227645, - "grad_norm": 0.7258734703063965, - "learning_rate": 4.9421752373068706e-05, - "loss": 0.3165, - "step": 104 - }, - { - "epoch": 0.4268292682926829, - "grad_norm": 0.8504552245140076, - "learning_rate": 4.940779036079126e-05, - "loss": 0.3691, - "step": 105 - }, - { - "epoch": 0.43089430894308944, - "grad_norm": 0.8210578560829163, - "learning_rate": 4.93936638139193e-05, - "loss": 0.3679, - "step": 106 - }, - { - "epoch": 0.4349593495934959, - "grad_norm": 0.9961466789245605, - "learning_rate": 4.937937282768083e-05, - "loss": 0.3625, - "step": 107 - }, - { - "epoch": 0.43902439024390244, - "grad_norm": 1.525476098060608, - "learning_rate": 4.9364917498412386e-05, - "loss": 0.3311, - "step": 108 - }, - { - "epoch": 0.44308943089430897, - "grad_norm": 1.3158917427062988, - "learning_rate": 4.935029792355834e-05, - "loss": 0.243, - "step": 109 - }, - { - "epoch": 0.44715447154471544, - "grad_norm": 1.0422078371047974, - "learning_rate": 4.9335514201670244e-05, - "loss": 0.388, - "step": 110 - }, - { - "epoch": 0.45121951219512196, - "grad_norm": 1.8480043411254883, - "learning_rate": 4.932056643240618e-05, - "loss": 0.3721, - "step": 111 - }, - { - "epoch": 0.45528455284552843, - "grad_norm": 1.6751759052276611, - "learning_rate": 4.9305454716530105e-05, - "loss": 0.3299, - "step": 112 - }, - { - "epoch": 0.45934959349593496, - "grad_norm": 0.9859967231750488, - "learning_rate": 4.929017915591113e-05, - "loss": 0.4039, - "step": 113 - }, - { - "epoch": 0.4634146341463415, - "grad_norm": 1.3842353820800781, - "learning_rate": 4.927473985352285e-05, - "loss": 0.431, - "step": 114 - }, - { - "epoch": 0.46747967479674796, - "grad_norm": 1.3645702600479126, - "learning_rate": 4.925913691344268e-05, - "loss": 0.326, - "step": 115 - }, - { - "epoch": 0.4715447154471545, - "grad_norm": 1.3944858312606812, - "learning_rate": 4.92433704408511e-05, - "loss": 0.263, - "step": 116 - }, - { - "epoch": 0.47560975609756095, - "grad_norm": 0.7792590260505676, - "learning_rate": 4.922744054203099e-05, - "loss": 0.3452, - "step": 117 - }, - { - "epoch": 0.4796747967479675, - "grad_norm": 1.0302146673202515, - "learning_rate": 4.92113473243669e-05, - "loss": 0.3107, - "step": 118 - }, - { - "epoch": 0.483739837398374, - "grad_norm": 0.9145269989967346, - "learning_rate": 4.919509089634431e-05, - "loss": 0.2691, - "step": 119 - }, - { - "epoch": 0.4878048780487805, - "grad_norm": 2.088017225265503, - "learning_rate": 4.917867136754893e-05, - "loss": 0.4763, - "step": 120 - }, - { - "epoch": 0.491869918699187, - "grad_norm": 3.1140401363372803, - "learning_rate": 4.916208884866593e-05, - "loss": 0.4667, - "step": 121 - }, - { - "epoch": 0.4959349593495935, - "grad_norm": 0.8972907662391663, - "learning_rate": 4.9145343451479196e-05, - "loss": 0.2483, - "step": 122 - }, - { - "epoch": 0.5, - "grad_norm": 0.9712522029876709, - "learning_rate": 4.912843528887063e-05, - "loss": 0.1997, - "step": 123 - }, - { - "epoch": 0.5040650406504065, - "grad_norm": 1.6554665565490723, - "learning_rate": 4.91113644748193e-05, - "loss": 0.4075, - "step": 124 - }, - { - "epoch": 0.508130081300813, - "grad_norm": 0.718744695186615, - "learning_rate": 4.909413112440075e-05, - "loss": 0.2702, - "step": 125 - }, - { - "epoch": 0.5121951219512195, - "grad_norm": 1.1200222969055176, - "learning_rate": 4.907673535378616e-05, - "loss": 0.3452, - "step": 126 - }, - { - "epoch": 0.516260162601626, - "grad_norm": 1.1216825246810913, - "learning_rate": 4.905917728024164e-05, - "loss": 0.3963, - "step": 127 - }, - { - "epoch": 0.5203252032520326, - "grad_norm": 0.9640887975692749, - "learning_rate": 4.9041457022127364e-05, - "loss": 0.3517, - "step": 128 - }, - { - "epoch": 0.524390243902439, - "grad_norm": 1.2504209280014038, - "learning_rate": 4.902357469889681e-05, - "loss": 0.3201, - "step": 129 - }, - { - "epoch": 0.5284552845528455, - "grad_norm": 1.415420413017273, - "learning_rate": 4.900553043109595e-05, - "loss": 0.4236, - "step": 130 - }, - { - "epoch": 0.532520325203252, - "grad_norm": 1.2397375106811523, - "learning_rate": 4.898732434036244e-05, - "loss": 0.2865, - "step": 131 - }, - { - "epoch": 0.5365853658536586, - "grad_norm": 1.0056957006454468, - "learning_rate": 4.896895654942478e-05, - "loss": 0.3202, - "step": 132 - }, - { - "epoch": 0.540650406504065, - "grad_norm": 1.1378791332244873, - "learning_rate": 4.895042718210152e-05, - "loss": 0.318, - "step": 133 - }, - { - "epoch": 0.5447154471544715, - "grad_norm": 1.9902440309524536, - "learning_rate": 4.893173636330041e-05, - "loss": 0.3267, - "step": 134 - }, - { - "epoch": 0.5487804878048781, - "grad_norm": 1.6747770309448242, - "learning_rate": 4.891288421901752e-05, - "loss": 0.3316, - "step": 135 - }, - { - "epoch": 0.5528455284552846, - "grad_norm": 1.6044206619262695, - "learning_rate": 4.889387087633647e-05, - "loss": 0.3411, - "step": 136 - }, - { - "epoch": 0.556910569105691, - "grad_norm": 1.5715720653533936, - "learning_rate": 4.887469646342752e-05, - "loss": 0.3324, - "step": 137 - }, - { - "epoch": 0.5609756097560976, - "grad_norm": 1.0535733699798584, - "learning_rate": 4.885536110954668e-05, - "loss": 0.2966, - "step": 138 - }, - { - "epoch": 0.5650406504065041, - "grad_norm": 1.5174460411071777, - "learning_rate": 4.883586494503492e-05, - "loss": 0.4864, - "step": 139 - }, - { - "epoch": 0.5691056910569106, - "grad_norm": 1.403935194015503, - "learning_rate": 4.881620810131723e-05, - "loss": 0.3292, - "step": 140 - }, - { - "epoch": 0.573170731707317, - "grad_norm": 1.1938104629516602, - "learning_rate": 4.879639071090174e-05, - "loss": 0.3094, - "step": 141 - }, - { - "epoch": 0.5772357723577236, - "grad_norm": 1.3123348951339722, - "learning_rate": 4.877641290737884e-05, - "loss": 0.3466, - "step": 142 - }, - { - "epoch": 0.5813008130081301, - "grad_norm": 1.3113127946853638, - "learning_rate": 4.875627482542028e-05, - "loss": 0.4126, - "step": 143 - }, - { - "epoch": 0.5853658536585366, - "grad_norm": 0.9945210218429565, - "learning_rate": 4.8735976600778253e-05, - "loss": 0.1674, - "step": 144 - }, - { - "epoch": 0.5894308943089431, - "grad_norm": 1.1661075353622437, - "learning_rate": 4.87155183702845e-05, - "loss": 0.3208, - "step": 145 - }, - { - "epoch": 0.5934959349593496, - "grad_norm": 1.2174713611602783, - "learning_rate": 4.869490027184935e-05, - "loss": 0.2578, - "step": 146 - }, - { - "epoch": 0.5975609756097561, - "grad_norm": 1.0534899234771729, - "learning_rate": 4.867412244446082e-05, - "loss": 0.2625, - "step": 147 - }, - { - "epoch": 0.6016260162601627, - "grad_norm": 1.3291311264038086, - "learning_rate": 4.865318502818369e-05, - "loss": 0.4193, - "step": 148 - }, - { - "epoch": 0.6056910569105691, - "grad_norm": 1.0063292980194092, - "learning_rate": 4.863208816415851e-05, - "loss": 0.1728, - "step": 149 - }, - { - "epoch": 0.6097560975609756, - "grad_norm": 1.4692482948303223, - "learning_rate": 4.86108319946007e-05, - "loss": 0.4379, - "step": 150 - }, - { - "epoch": 0.6138211382113821, - "grad_norm": 1.0334157943725586, - "learning_rate": 4.858941666279955e-05, - "loss": 0.2875, - "step": 151 - }, - { - "epoch": 0.6178861788617886, - "grad_norm": 2.2277495861053467, - "learning_rate": 4.8567842313117304e-05, - "loss": 0.4843, - "step": 152 - }, - { - "epoch": 0.6219512195121951, - "grad_norm": 2.0515499114990234, - "learning_rate": 4.854610909098812e-05, - "loss": 0.3953, - "step": 153 - }, - { - "epoch": 0.6260162601626016, - "grad_norm": 1.1603871583938599, - "learning_rate": 4.852421714291716e-05, - "loss": 0.4077, - "step": 154 - }, - { - "epoch": 0.6300813008130082, - "grad_norm": 1.225406289100647, - "learning_rate": 4.8502166616479535e-05, - "loss": 0.3288, - "step": 155 - }, - { - "epoch": 0.6341463414634146, - "grad_norm": 1.3822065591812134, - "learning_rate": 4.847995766031937e-05, - "loss": 0.401, - "step": 156 - }, - { - "epoch": 0.6382113821138211, - "grad_norm": 1.425358772277832, - "learning_rate": 4.845759042414878e-05, - "loss": 0.3018, - "step": 157 - }, - { - "epoch": 0.6422764227642277, - "grad_norm": 1.0022040605545044, - "learning_rate": 4.843506505874682e-05, - "loss": 0.3792, - "step": 158 - }, - { - "epoch": 0.6463414634146342, - "grad_norm": 1.552306890487671, - "learning_rate": 4.841238171595854e-05, - "loss": 0.2577, - "step": 159 - }, - { - "epoch": 0.6504065040650406, - "grad_norm": 0.9894719123840332, - "learning_rate": 4.838954054869392e-05, - "loss": 0.3077, - "step": 160 - }, - { - "epoch": 0.6544715447154471, - "grad_norm": 1.1398520469665527, - "learning_rate": 4.8366541710926825e-05, - "loss": 0.3246, - "step": 161 - }, - { - "epoch": 0.6585365853658537, - "grad_norm": 1.2690491676330566, - "learning_rate": 4.8343385357694025e-05, - "loss": 0.2767, - "step": 162 - }, - { - "epoch": 0.6626016260162602, - "grad_norm": 2.858774185180664, - "learning_rate": 4.8320071645094064e-05, - "loss": 0.5551, - "step": 163 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 1.0194021463394165, - "learning_rate": 4.829660073028631e-05, - "loss": 0.2819, - "step": 164 - }, - { - "epoch": 0.6707317073170732, - "grad_norm": 1.0878033638000488, - "learning_rate": 4.82729727714898e-05, - "loss": 0.3384, - "step": 165 - }, - { - "epoch": 0.6747967479674797, - "grad_norm": 1.1534984111785889, - "learning_rate": 4.824918792798222e-05, - "loss": 0.1882, - "step": 166 - }, - { - "epoch": 0.6788617886178862, - "grad_norm": 1.8613225221633911, - "learning_rate": 4.8225246360098866e-05, - "loss": 0.5478, - "step": 167 - }, - { - "epoch": 0.6829268292682927, - "grad_norm": 1.1923648118972778, - "learning_rate": 4.820114822923147e-05, - "loss": 0.2842, - "step": 168 - }, - { - "epoch": 0.6869918699186992, - "grad_norm": 1.3888219594955444, - "learning_rate": 4.8176893697827196e-05, - "loss": 0.3278, - "step": 169 - }, - { - "epoch": 0.6910569105691057, - "grad_norm": 0.8846197724342346, - "learning_rate": 4.815248292938752e-05, - "loss": 0.2993, - "step": 170 - }, - { - "epoch": 0.6951219512195121, - "grad_norm": 0.9156147837638855, - "learning_rate": 4.81279160884671e-05, - "loss": 0.2249, - "step": 171 - }, - { - "epoch": 0.6991869918699187, - "grad_norm": 1.1384847164154053, - "learning_rate": 4.8103193340672706e-05, - "loss": 0.2549, - "step": 172 - }, - { - "epoch": 0.7032520325203252, - "grad_norm": 1.067744255065918, - "learning_rate": 4.807831485266208e-05, - "loss": 0.2536, - "step": 173 - }, - { - "epoch": 0.7073170731707317, - "grad_norm": 1.4895604848861694, - "learning_rate": 4.805328079214282e-05, - "loss": 0.3458, - "step": 174 - }, - { - "epoch": 0.7113821138211383, - "grad_norm": 1.5581872463226318, - "learning_rate": 4.802809132787125e-05, - "loss": 0.3289, - "step": 175 - }, - { - "epoch": 0.7154471544715447, - "grad_norm": 1.9799189567565918, - "learning_rate": 4.80027466296513e-05, - "loss": 0.4856, - "step": 176 - }, - { - "epoch": 0.7195121951219512, - "grad_norm": 0.6421610116958618, - "learning_rate": 4.79772468683333e-05, - "loss": 0.2513, - "step": 177 - }, - { - "epoch": 0.7235772357723578, - "grad_norm": 1.1410386562347412, - "learning_rate": 4.795159221581291e-05, - "loss": 0.3893, - "step": 178 - }, - { - "epoch": 0.7276422764227642, - "grad_norm": 0.8020047545433044, - "learning_rate": 4.792578284502991e-05, - "loss": 0.2941, - "step": 179 - }, - { - "epoch": 0.7317073170731707, - "grad_norm": 1.1755205392837524, - "learning_rate": 4.7899818929967035e-05, - "loss": 0.3193, - "step": 180 - }, - { - "epoch": 0.7357723577235772, - "grad_norm": 1.0635044574737549, - "learning_rate": 4.787370064564883e-05, - "loss": 0.3861, - "step": 181 - }, - { - "epoch": 0.7398373983739838, - "grad_norm": 1.2178239822387695, - "learning_rate": 4.7847428168140447e-05, - "loss": 0.2827, - "step": 182 - }, - { - "epoch": 0.7439024390243902, - "grad_norm": 1.1486563682556152, - "learning_rate": 4.782100167454646e-05, - "loss": 0.4087, - "step": 183 - }, - { - "epoch": 0.7479674796747967, - "grad_norm": 1.378288745880127, - "learning_rate": 4.779442134300968e-05, - "loss": 0.3546, - "step": 184 - }, - { - "epoch": 0.7520325203252033, - "grad_norm": 1.416754961013794, - "learning_rate": 4.776768735270996e-05, - "loss": 0.3753, - "step": 185 - }, - { - "epoch": 0.7560975609756098, - "grad_norm": 1.3719738721847534, - "learning_rate": 4.774079988386296e-05, - "loss": 0.3412, - "step": 186 - }, - { - "epoch": 0.7601626016260162, - "grad_norm": 1.213966965675354, - "learning_rate": 4.7713759117718964e-05, - "loss": 0.2584, - "step": 187 - }, - { - "epoch": 0.7642276422764228, - "grad_norm": 1.3669952154159546, - "learning_rate": 4.7686565236561634e-05, - "loss": 0.3264, - "step": 188 - }, - { - "epoch": 0.7682926829268293, - "grad_norm": 1.3080284595489502, - "learning_rate": 4.76592184237068e-05, - "loss": 0.3138, - "step": 189 - }, - { - "epoch": 0.7723577235772358, - "grad_norm": 1.573143482208252, - "learning_rate": 4.7631718863501195e-05, - "loss": 0.3101, - "step": 190 - }, - { - "epoch": 0.7764227642276422, - "grad_norm": 1.2406355142593384, - "learning_rate": 4.760406674132126e-05, - "loss": 0.3016, - "step": 191 - }, - { - "epoch": 0.7804878048780488, - "grad_norm": 1.2901965379714966, - "learning_rate": 4.757626224357184e-05, - "loss": 0.2481, - "step": 192 - }, - { - "epoch": 0.7845528455284553, - "grad_norm": 1.1622278690338135, - "learning_rate": 4.754830555768499e-05, - "loss": 0.3275, - "step": 193 - }, - { - "epoch": 0.7886178861788617, - "grad_norm": 1.159242033958435, - "learning_rate": 4.752019687211864e-05, - "loss": 0.2436, - "step": 194 - }, - { - "epoch": 0.7926829268292683, - "grad_norm": 0.9676827788352966, - "learning_rate": 4.749193637635537e-05, - "loss": 0.3343, - "step": 195 - }, - { - "epoch": 0.7967479674796748, - "grad_norm": 1.8030028343200684, - "learning_rate": 4.746352426090114e-05, - "loss": 0.4053, - "step": 196 - }, - { - "epoch": 0.8008130081300813, - "grad_norm": 1.1719882488250732, - "learning_rate": 4.743496071728396e-05, - "loss": 0.3678, - "step": 197 - }, - { - "epoch": 0.8048780487804879, - "grad_norm": 1.0295383930206299, - "learning_rate": 4.740624593805265e-05, - "loss": 0.3732, - "step": 198 - }, - { - "epoch": 0.8089430894308943, - "grad_norm": 1.2781455516815186, - "learning_rate": 4.7377380116775516e-05, - "loss": 0.3153, - "step": 199 - }, - { - "epoch": 0.8130081300813008, - "grad_norm": 1.9800623655319214, - "learning_rate": 4.734836344803905e-05, - "loss": 0.3049, - "step": 200 - }, - { - "epoch": 0.8170731707317073, - "grad_norm": 0.7194023728370667, - "learning_rate": 4.73191961274466e-05, - "loss": 0.288, - "step": 201 - }, - { - "epoch": 0.8211382113821138, - "grad_norm": 0.9485660791397095, - "learning_rate": 4.728987835161708e-05, - "loss": 0.2721, - "step": 202 - }, - { - "epoch": 0.8252032520325203, - "grad_norm": 1.4550368785858154, - "learning_rate": 4.7260410318183644e-05, - "loss": 0.2866, - "step": 203 - }, - { - "epoch": 0.8292682926829268, - "grad_norm": 1.3849658966064453, - "learning_rate": 4.723079222579234e-05, - "loss": 0.3603, - "step": 204 - }, - { - "epoch": 0.8333333333333334, - "grad_norm": 1.3717058897018433, - "learning_rate": 4.7201024274100746e-05, - "loss": 0.2594, - "step": 205 - }, - { - "epoch": 0.8373983739837398, - "grad_norm": 0.7866950631141663, - "learning_rate": 4.7171106663776694e-05, - "loss": 0.2521, - "step": 206 - }, - { - "epoch": 0.8414634146341463, - "grad_norm": 1.1188017129898071, - "learning_rate": 4.714103959649684e-05, - "loss": 0.3075, - "step": 207 - }, - { - "epoch": 0.8455284552845529, - "grad_norm": 1.7865400314331055, - "learning_rate": 4.711082327494536e-05, - "loss": 0.372, - "step": 208 - }, - { - "epoch": 0.8495934959349594, - "grad_norm": 1.3013150691986084, - "learning_rate": 4.708045790281256e-05, - "loss": 0.4147, - "step": 209 - }, - { - "epoch": 0.8536585365853658, - "grad_norm": 0.854185163974762, - "learning_rate": 4.70499436847935e-05, - "loss": 0.3051, - "step": 210 - }, - { - "epoch": 0.8577235772357723, - "grad_norm": 1.2511900663375854, - "learning_rate": 4.7019280826586606e-05, - "loss": 0.4043, - "step": 211 - }, - { - "epoch": 0.8617886178861789, - "grad_norm": 1.4942708015441895, - "learning_rate": 4.698846953489234e-05, - "loss": 0.3171, - "step": 212 - }, - { - "epoch": 0.8658536585365854, - "grad_norm": 0.7822403311729431, - "learning_rate": 4.6957510017411744e-05, - "loss": 0.2763, - "step": 213 - }, - { - "epoch": 0.8699186991869918, - "grad_norm": 1.646827220916748, - "learning_rate": 4.692640248284503e-05, - "loss": 0.2852, - "step": 214 - }, - { - "epoch": 0.8739837398373984, - "grad_norm": 0.889462411403656, - "learning_rate": 4.689514714089025e-05, - "loss": 0.3629, - "step": 215 - }, - { - "epoch": 0.8780487804878049, - "grad_norm": 0.7499379515647888, - "learning_rate": 4.6863744202241805e-05, - "loss": 0.2525, - "step": 216 - }, - { - "epoch": 0.8821138211382114, - "grad_norm": 2.0293996334075928, - "learning_rate": 4.683219387858907e-05, - "loss": 0.5801, - "step": 217 - }, - { - "epoch": 0.8861788617886179, - "grad_norm": 0.9877073168754578, - "learning_rate": 4.680049638261494e-05, - "loss": 0.3128, - "step": 218 - }, - { - "epoch": 0.8902439024390244, - "grad_norm": 0.9436555504798889, - "learning_rate": 4.6768651927994434e-05, - "loss": 0.4697, - "step": 219 - }, - { - "epoch": 0.8943089430894309, - "grad_norm": 1.1047589778900146, - "learning_rate": 4.673666072939319e-05, - "loss": 0.2329, - "step": 220 - }, - { - "epoch": 0.8983739837398373, - "grad_norm": 1.0174518823623657, - "learning_rate": 4.67045230024661e-05, - "loss": 0.2093, - "step": 221 - }, - { - "epoch": 0.9024390243902439, - "grad_norm": 0.9476176500320435, - "learning_rate": 4.667223896385577e-05, - "loss": 0.3064, - "step": 222 - }, - { - "epoch": 0.9065040650406504, - "grad_norm": 1.8664462566375732, - "learning_rate": 4.6639808831191156e-05, - "loss": 0.2539, - "step": 223 - }, - { - "epoch": 0.9105691056910569, - "grad_norm": 1.4686872959136963, - "learning_rate": 4.660723282308599e-05, - "loss": 0.2819, - "step": 224 - }, - { - "epoch": 0.9146341463414634, - "grad_norm": 0.8365880846977234, - "learning_rate": 4.657451115913739e-05, - "loss": 0.1743, - "step": 225 - }, - { - "epoch": 0.9186991869918699, - "grad_norm": 0.8225746750831604, - "learning_rate": 4.654164405992435e-05, - "loss": 0.2097, - "step": 226 - }, - { - "epoch": 0.9227642276422764, - "grad_norm": 1.6638455390930176, - "learning_rate": 4.6508631747006246e-05, - "loss": 0.4371, - "step": 227 - }, - { - "epoch": 0.926829268292683, - "grad_norm": 1.4620201587677002, - "learning_rate": 4.647547444292134e-05, - "loss": 0.3019, - "step": 228 - }, - { - "epoch": 0.9308943089430894, - "grad_norm": 0.8998772501945496, - "learning_rate": 4.644217237118532e-05, - "loss": 0.2587, - "step": 229 - }, - { - "epoch": 0.9349593495934959, - "grad_norm": 1.1790924072265625, - "learning_rate": 4.640872575628973e-05, - "loss": 0.232, - "step": 230 - }, - { - "epoch": 0.9390243902439024, - "grad_norm": 1.0547899007797241, - "learning_rate": 4.6375134823700505e-05, - "loss": 0.3647, - "step": 231 - }, - { - "epoch": 0.943089430894309, - "grad_norm": 1.2183856964111328, - "learning_rate": 4.634139979985643e-05, - "loss": 0.3218, - "step": 232 - }, - { - "epoch": 0.9471544715447154, - "grad_norm": 1.4923841953277588, - "learning_rate": 4.630752091216764e-05, - "loss": 0.3482, - "step": 233 - }, - { - "epoch": 0.9512195121951219, - "grad_norm": 0.7389389276504517, - "learning_rate": 4.6273498389014014e-05, - "loss": 0.2886, - "step": 234 - }, - { - "epoch": 0.9552845528455285, - "grad_norm": 0.8663522601127625, - "learning_rate": 4.623933245974376e-05, - "loss": 0.3292, - "step": 235 - }, - { - "epoch": 0.959349593495935, - "grad_norm": 1.4493833780288696, - "learning_rate": 4.620502335467174e-05, - "loss": 0.2394, - "step": 236 - }, - { - "epoch": 0.9634146341463414, - "grad_norm": 1.0228676795959473, - "learning_rate": 4.6170571305077986e-05, - "loss": 0.2503, - "step": 237 - }, - { - "epoch": 0.967479674796748, - "grad_norm": 1.036851167678833, - "learning_rate": 4.613597654320615e-05, - "loss": 0.3158, - "step": 238 - }, - { - "epoch": 0.9715447154471545, - "grad_norm": 1.463829517364502, - "learning_rate": 4.610123930226189e-05, - "loss": 0.3415, - "step": 239 - }, - { - "epoch": 0.975609756097561, - "grad_norm": 1.3054817914962769, - "learning_rate": 4.606635981641135e-05, - "loss": 0.3325, - "step": 240 - }, - { - "epoch": 0.9796747967479674, - "grad_norm": 1.0887513160705566, - "learning_rate": 4.6031338320779534e-05, - "loss": 0.2462, - "step": 241 - }, - { - "epoch": 0.983739837398374, - "grad_norm": 2.09199595451355, - "learning_rate": 4.599617505144875e-05, - "loss": 0.4012, - "step": 242 - }, - { - "epoch": 0.9878048780487805, - "grad_norm": 1.1317472457885742, - "learning_rate": 4.5960870245457036e-05, - "loss": 0.3834, - "step": 243 - }, - { - "epoch": 0.991869918699187, - "grad_norm": 0.9485702514648438, - "learning_rate": 4.5925424140796505e-05, - "loss": 0.2743, - "step": 244 - }, - { - "epoch": 0.9959349593495935, - "grad_norm": 1.3825092315673828, - "learning_rate": 4.5889836976411796e-05, - "loss": 0.3896, - "step": 245 - }, - { - "epoch": 1.0, - "grad_norm": 1.7264575958251953, - "learning_rate": 4.5854108992198417e-05, - "loss": 0.281, - "step": 246 - }, - { - "epoch": 1.0, - "eval_accuracy": 0.9001145183865632, - "eval_loss": 0.30376696586608887, - "eval_precision_bio": 0.5126050420168067, - "eval_precision_cs": 0.9168211920529802, - "eval_precision_math": 0.4925373134328358, - "eval_precision_physics": 0.44642857142857145, - "eval_recall_bio": 0.3465909090909091, - "eval_recall_cs": 0.9804532577903683, - "eval_recall_math": 0.15492957746478872, - "eval_recall_physics": 0.12690355329949238, - "eval_runtime": 30.7745, - "eval_samples_per_second": 255.373, - "eval_steps_per_second": 31.942, - "step": 246 - }, - { - "epoch": 1.0040650406504066, - "grad_norm": 1.028691053390503, - "learning_rate": 4.581824042900118e-05, - "loss": 0.1808, - "step": 247 - }, - { - "epoch": 1.008130081300813, - "grad_norm": 1.4221172332763672, - "learning_rate": 4.578223152861254e-05, - "loss": 0.3066, - "step": 248 - }, - { - "epoch": 1.0121951219512195, - "grad_norm": 1.193612813949585, - "learning_rate": 4.574608253377096e-05, - "loss": 0.1904, - "step": 249 - }, - { - "epoch": 1.016260162601626, - "grad_norm": 0.8225454092025757, - "learning_rate": 4.570979368815932e-05, - "loss": 0.213, - "step": 250 - }, - { - "epoch": 1.0203252032520325, - "grad_norm": 0.6823122501373291, - "learning_rate": 4.567336523640322e-05, - "loss": 0.2471, - "step": 251 - }, - { - "epoch": 1.024390243902439, - "grad_norm": 0.7302724123001099, - "learning_rate": 4.563679742406935e-05, - "loss": 0.2095, - "step": 252 - }, - { - "epoch": 1.0284552845528456, - "grad_norm": 2.5501978397369385, - "learning_rate": 4.5600090497663864e-05, - "loss": 0.4635, - "step": 253 - }, - { - "epoch": 1.032520325203252, - "grad_norm": 1.0953235626220703, - "learning_rate": 4.556324470463066e-05, - "loss": 0.2199, - "step": 254 - }, - { - "epoch": 1.0365853658536586, - "grad_norm": 2.7306058406829834, - "learning_rate": 4.5526260293349785e-05, - "loss": 0.5122, - "step": 255 - }, - { - "epoch": 1.040650406504065, - "grad_norm": 1.1657317876815796, - "learning_rate": 4.548913751313568e-05, - "loss": 0.1506, - "step": 256 - }, - { - "epoch": 1.0447154471544715, - "grad_norm": 2.3006463050842285, - "learning_rate": 4.545187661423559e-05, - "loss": 0.3945, - "step": 257 - }, - { - "epoch": 1.048780487804878, - "grad_norm": 1.6349135637283325, - "learning_rate": 4.541447784782777e-05, - "loss": 0.4536, - "step": 258 - }, - { - "epoch": 1.0528455284552845, - "grad_norm": 1.4519246816635132, - "learning_rate": 4.537694146601989e-05, - "loss": 0.4033, - "step": 259 - }, - { - "epoch": 1.056910569105691, - "grad_norm": 1.041219711303711, - "learning_rate": 4.533926772184728e-05, - "loss": 0.2806, - "step": 260 - }, - { - "epoch": 1.0609756097560976, - "grad_norm": 1.5814658403396606, - "learning_rate": 4.5301456869271255e-05, - "loss": 0.4728, - "step": 261 - }, - { - "epoch": 1.065040650406504, - "grad_norm": 2.131558418273926, - "learning_rate": 4.5263509163177356e-05, - "loss": 0.2919, - "step": 262 - }, - { - "epoch": 1.0691056910569106, - "grad_norm": 1.8359557390213013, - "learning_rate": 4.522542485937369e-05, - "loss": 0.3304, - "step": 263 - }, - { - "epoch": 1.0731707317073171, - "grad_norm": 1.636659026145935, - "learning_rate": 4.518720421458917e-05, - "loss": 0.5028, - "step": 264 - }, - { - "epoch": 1.0772357723577235, - "grad_norm": 1.4439018964767456, - "learning_rate": 4.514884748647179e-05, - "loss": 0.3801, - "step": 265 - }, - { - "epoch": 1.08130081300813, - "grad_norm": 1.1658079624176025, - "learning_rate": 4.5110354933586896e-05, - "loss": 0.323, - "step": 266 - }, - { - "epoch": 1.0853658536585367, - "grad_norm": 0.9779409170150757, - "learning_rate": 4.507172681541543e-05, - "loss": 0.1576, - "step": 267 - }, - { - "epoch": 1.089430894308943, - "grad_norm": 0.8524717688560486, - "learning_rate": 4.503296339235221e-05, - "loss": 0.2272, - "step": 268 - }, - { - "epoch": 1.0934959349593496, - "grad_norm": 1.1657898426055908, - "learning_rate": 4.4994064925704126e-05, - "loss": 0.2397, - "step": 269 - }, - { - "epoch": 1.0975609756097562, - "grad_norm": 1.5110620260238647, - "learning_rate": 4.495503167768842e-05, - "loss": 0.3064, - "step": 270 - }, - { - "epoch": 1.1016260162601625, - "grad_norm": 1.3554799556732178, - "learning_rate": 4.49158639114309e-05, - "loss": 0.3426, - "step": 271 - }, - { - "epoch": 1.1056910569105691, - "grad_norm": 1.5044909715652466, - "learning_rate": 4.4876561890964186e-05, - "loss": 0.2873, - "step": 272 - }, - { - "epoch": 1.1097560975609757, - "grad_norm": 1.1328275203704834, - "learning_rate": 4.483712588122589e-05, - "loss": 0.312, - "step": 273 - }, - { - "epoch": 1.113821138211382, - "grad_norm": 0.7558320760726929, - "learning_rate": 4.479755614805688e-05, - "loss": 0.2655, - "step": 274 - }, - { - "epoch": 1.1178861788617886, - "grad_norm": 0.8732352256774902, - "learning_rate": 4.4757852958199444e-05, - "loss": 0.2512, - "step": 275 - }, - { - "epoch": 1.1219512195121952, - "grad_norm": 0.8884730935096741, - "learning_rate": 4.471801657929551e-05, - "loss": 0.2408, - "step": 276 - }, - { - "epoch": 1.1260162601626016, - "grad_norm": 1.051373839378357, - "learning_rate": 4.467804727988485e-05, - "loss": 0.2649, - "step": 277 - }, - { - "epoch": 1.1300813008130082, - "grad_norm": 1.6665611267089844, - "learning_rate": 4.463794532940328e-05, - "loss": 0.4384, - "step": 278 - }, - { - "epoch": 1.1341463414634148, - "grad_norm": 1.5425509214401245, - "learning_rate": 4.459771099818079e-05, - "loss": 0.231, - "step": 279 - }, - { - "epoch": 1.1382113821138211, - "grad_norm": 1.1798183917999268, - "learning_rate": 4.455734455743977e-05, - "loss": 0.3653, - "step": 280 - }, - { - "epoch": 1.1422764227642277, - "grad_norm": 1.448278784751892, - "learning_rate": 4.45168462792932e-05, - "loss": 0.2448, - "step": 281 - }, - { - "epoch": 1.146341463414634, - "grad_norm": 1.0073964595794678, - "learning_rate": 4.447621643674275e-05, - "loss": 0.2575, - "step": 282 - }, - { - "epoch": 1.1504065040650406, - "grad_norm": 1.6564254760742188, - "learning_rate": 4.4435455303676973e-05, - "loss": 0.3116, - "step": 283 - }, - { - "epoch": 1.1544715447154472, - "grad_norm": 1.247767686843872, - "learning_rate": 4.439456315486949e-05, - "loss": 0.2656, - "step": 284 - }, - { - "epoch": 1.1585365853658536, - "grad_norm": 0.8479900360107422, - "learning_rate": 4.4353540265977064e-05, - "loss": 0.2477, - "step": 285 - }, - { - "epoch": 1.1626016260162602, - "grad_norm": 1.104665756225586, - "learning_rate": 4.431238691353784e-05, - "loss": 0.2855, - "step": 286 - }, - { - "epoch": 1.1666666666666667, - "grad_norm": 1.1042630672454834, - "learning_rate": 4.427110337496938e-05, - "loss": 0.2786, - "step": 287 - }, - { - "epoch": 1.170731707317073, - "grad_norm": 1.6606295108795166, - "learning_rate": 4.422968992856687e-05, - "loss": 0.3315, - "step": 288 - }, - { - "epoch": 1.1747967479674797, - "grad_norm": 1.2095733880996704, - "learning_rate": 4.41881468535012e-05, - "loss": 0.308, - "step": 289 - }, - { - "epoch": 1.1788617886178863, - "grad_norm": 0.9724107384681702, - "learning_rate": 4.4146474429817095e-05, - "loss": 0.1594, - "step": 290 - }, - { - "epoch": 1.1829268292682926, - "grad_norm": 1.1982440948486328, - "learning_rate": 4.410467293843123e-05, - "loss": 0.3459, - "step": 291 - }, - { - "epoch": 1.1869918699186992, - "grad_norm": 1.308335304260254, - "learning_rate": 4.406274266113034e-05, - "loss": 0.4625, - "step": 292 - }, - { - "epoch": 1.1910569105691058, - "grad_norm": 0.9014081358909607, - "learning_rate": 4.40206838805693e-05, - "loss": 0.3353, - "step": 293 - }, - { - "epoch": 1.1951219512195121, - "grad_norm": 1.061950922012329, - "learning_rate": 4.397849688026926e-05, - "loss": 0.3329, - "step": 294 - }, - { - "epoch": 1.1991869918699187, - "grad_norm": 1.4644041061401367, - "learning_rate": 4.3936181944615686e-05, - "loss": 0.3053, - "step": 295 - }, - { - "epoch": 1.203252032520325, - "grad_norm": 0.9609421491622925, - "learning_rate": 4.389373935885646e-05, - "loss": 0.274, - "step": 296 - }, - { - "epoch": 1.2073170731707317, - "grad_norm": 0.9803066849708557, - "learning_rate": 4.3851169409099986e-05, - "loss": 0.3487, - "step": 297 - }, - { - "epoch": 1.2113821138211383, - "grad_norm": 1.078660011291504, - "learning_rate": 4.380847238231322e-05, - "loss": 0.3663, - "step": 298 - }, - { - "epoch": 1.2154471544715446, - "grad_norm": 0.8618593811988831, - "learning_rate": 4.3765648566319755e-05, - "loss": 0.1843, - "step": 299 - }, - { - "epoch": 1.2195121951219512, - "grad_norm": 1.5458698272705078, - "learning_rate": 4.372269824979789e-05, - "loss": 0.4548, - "step": 300 - }, - { - "epoch": 1.2235772357723578, - "grad_norm": 0.9989164471626282, - "learning_rate": 4.367962172227866e-05, - "loss": 0.3196, - "step": 301 - }, - { - "epoch": 1.2276422764227641, - "grad_norm": 1.059915542602539, - "learning_rate": 4.36364192741439e-05, - "loss": 0.341, - "step": 302 - }, - { - "epoch": 1.2317073170731707, - "grad_norm": 1.0789270401000977, - "learning_rate": 4.359309119662429e-05, - "loss": 0.28, - "step": 303 - }, - { - "epoch": 1.2357723577235773, - "grad_norm": 1.2683236598968506, - "learning_rate": 4.354963778179738e-05, - "loss": 0.2494, - "step": 304 - }, - { - "epoch": 1.2398373983739837, - "grad_norm": 0.9698534607887268, - "learning_rate": 4.350605932258563e-05, - "loss": 0.3069, - "step": 305 - }, - { - "epoch": 1.2439024390243902, - "grad_norm": 1.1092990636825562, - "learning_rate": 4.346235611275443e-05, - "loss": 0.2409, - "step": 306 - }, - { - "epoch": 1.2479674796747968, - "grad_norm": 1.2444003820419312, - "learning_rate": 4.341852844691012e-05, - "loss": 0.3057, - "step": 307 - }, - { - "epoch": 1.2520325203252032, - "grad_norm": 1.0667630434036255, - "learning_rate": 4.337457662049803e-05, - "loss": 0.2668, - "step": 308 - }, - { - "epoch": 1.2560975609756098, - "grad_norm": 0.9309645891189575, - "learning_rate": 4.3330500929800434e-05, - "loss": 0.2398, - "step": 309 - }, - { - "epoch": 1.2601626016260163, - "grad_norm": 1.1863517761230469, - "learning_rate": 4.328630167193459e-05, - "loss": 0.2953, - "step": 310 - }, - { - "epoch": 1.2642276422764227, - "grad_norm": 1.8173768520355225, - "learning_rate": 4.324197914485075e-05, - "loss": 0.3598, - "step": 311 - }, - { - "epoch": 1.2682926829268293, - "grad_norm": 1.1182650327682495, - "learning_rate": 4.3197533647330115e-05, - "loss": 0.4198, - "step": 312 - }, - { - "epoch": 1.2723577235772359, - "grad_norm": 0.9192646741867065, - "learning_rate": 4.3152965478982836e-05, - "loss": 0.1167, - "step": 313 - }, - { - "epoch": 1.2764227642276422, - "grad_norm": 1.926855206489563, - "learning_rate": 4.3108274940246014e-05, - "loss": 0.4009, - "step": 314 - }, - { - "epoch": 1.2804878048780488, - "grad_norm": 1.554799199104309, - "learning_rate": 4.306346233238164e-05, - "loss": 0.3824, - "step": 315 - }, - { - "epoch": 1.2845528455284554, - "grad_norm": 1.2775733470916748, - "learning_rate": 4.301852795747458e-05, - "loss": 0.4012, - "step": 316 - }, - { - "epoch": 1.2886178861788617, - "grad_norm": 1.5684229135513306, - "learning_rate": 4.297347211843056e-05, - "loss": 0.2535, - "step": 317 - }, - { - "epoch": 1.2926829268292683, - "grad_norm": 0.8492239713668823, - "learning_rate": 4.292829511897409e-05, - "loss": 0.3606, - "step": 318 - }, - { - "epoch": 1.296747967479675, - "grad_norm": 1.5130596160888672, - "learning_rate": 4.288299726364643e-05, - "loss": 0.2879, - "step": 319 - }, - { - "epoch": 1.3008130081300813, - "grad_norm": 1.647362470626831, - "learning_rate": 4.283757885780353e-05, - "loss": 0.276, - "step": 320 - }, - { - "epoch": 1.3048780487804879, - "grad_norm": 1.0791544914245605, - "learning_rate": 4.2792040207614005e-05, - "loss": 0.2878, - "step": 321 - }, - { - "epoch": 1.3089430894308944, - "grad_norm": 1.4818476438522339, - "learning_rate": 4.274638162005703e-05, - "loss": 0.3626, - "step": 322 - }, - { - "epoch": 1.3130081300813008, - "grad_norm": 1.7319128513336182, - "learning_rate": 4.270060340292027e-05, - "loss": 0.3644, - "step": 323 - }, - { - "epoch": 1.3170731707317074, - "grad_norm": 0.876948893070221, - "learning_rate": 4.265470586479785e-05, - "loss": 0.1733, - "step": 324 - }, - { - "epoch": 1.321138211382114, - "grad_norm": 1.4998966455459595, - "learning_rate": 4.260868931508822e-05, - "loss": 0.1451, - "step": 325 - }, - { - "epoch": 1.3252032520325203, - "grad_norm": 1.1622909307479858, - "learning_rate": 4.256255406399213e-05, - "loss": 0.2888, - "step": 326 - }, - { - "epoch": 1.329268292682927, - "grad_norm": 1.2140052318572998, - "learning_rate": 4.251630042251047e-05, - "loss": 0.3105, - "step": 327 - }, - { - "epoch": 1.3333333333333333, - "grad_norm": 1.2368940114974976, - "learning_rate": 4.246992870244222e-05, - "loss": 0.1765, - "step": 328 - }, - { - "epoch": 1.3373983739837398, - "grad_norm": 1.0317000150680542, - "learning_rate": 4.242343921638234e-05, - "loss": 0.1933, - "step": 329 - }, - { - "epoch": 1.3414634146341464, - "grad_norm": 2.0966598987579346, - "learning_rate": 4.2376832277719645e-05, - "loss": 0.3714, - "step": 330 - }, - { - "epoch": 1.3455284552845528, - "grad_norm": 2.0586447715759277, - "learning_rate": 4.233010820063473e-05, - "loss": 0.3266, - "step": 331 - }, - { - "epoch": 1.3495934959349594, - "grad_norm": 1.8100645542144775, - "learning_rate": 4.22832673000978e-05, - "loss": 0.356, - "step": 332 - }, - { - "epoch": 1.3536585365853657, - "grad_norm": 1.4494227170944214, - "learning_rate": 4.2236309891866596e-05, - "loss": 0.3143, - "step": 333 - }, - { - "epoch": 1.3577235772357723, - "grad_norm": 1.3032536506652832, - "learning_rate": 4.218923629248425e-05, - "loss": 0.3549, - "step": 334 - }, - { - "epoch": 1.3617886178861789, - "grad_norm": 1.2196041345596313, - "learning_rate": 4.214204681927712e-05, - "loss": 0.3155, - "step": 335 - }, - { - "epoch": 1.3658536585365852, - "grad_norm": 1.2922784090042114, - "learning_rate": 4.2094741790352675e-05, - "loss": 0.3317, - "step": 336 - }, - { - "epoch": 1.3699186991869918, - "grad_norm": 1.225430965423584, - "learning_rate": 4.204732152459742e-05, - "loss": 0.2896, - "step": 337 - }, - { - "epoch": 1.3739837398373984, - "grad_norm": 1.4281582832336426, - "learning_rate": 4.199978634167458e-05, - "loss": 0.2185, - "step": 338 - }, - { - "epoch": 1.3780487804878048, - "grad_norm": 1.9603033065795898, - "learning_rate": 4.195213656202213e-05, - "loss": 0.2194, - "step": 339 - }, - { - "epoch": 1.3821138211382114, - "grad_norm": 1.3090780973434448, - "learning_rate": 4.1904372506850484e-05, - "loss": 0.3641, - "step": 340 - }, - { - "epoch": 1.386178861788618, - "grad_norm": 1.2736930847167969, - "learning_rate": 4.1856494498140454e-05, - "loss": 0.2402, - "step": 341 - }, - { - "epoch": 1.3902439024390243, - "grad_norm": 1.0292978286743164, - "learning_rate": 4.1808502858640975e-05, - "loss": 0.381, - "step": 342 - }, - { - "epoch": 1.3943089430894309, - "grad_norm": 1.4204421043395996, - "learning_rate": 4.176039791186699e-05, - "loss": 0.331, - "step": 343 - }, - { - "epoch": 1.3983739837398375, - "grad_norm": 1.2310943603515625, - "learning_rate": 4.171217998209726e-05, - "loss": 0.3087, - "step": 344 - }, - { - "epoch": 1.4024390243902438, - "grad_norm": 1.2446876764297485, - "learning_rate": 4.166384939437217e-05, - "loss": 0.3394, - "step": 345 - }, - { - "epoch": 1.4065040650406504, - "grad_norm": 0.8111627101898193, - "learning_rate": 4.161540647449154e-05, - "loss": 0.2152, - "step": 346 - }, - { - "epoch": 1.410569105691057, - "grad_norm": 0.9822183847427368, - "learning_rate": 4.156685154901242e-05, - "loss": 0.2689, - "step": 347 - }, - { - "epoch": 1.4146341463414633, - "grad_norm": 0.9851759672164917, - "learning_rate": 4.15181849452469e-05, - "loss": 0.301, - "step": 348 - }, - { - "epoch": 1.41869918699187, - "grad_norm": 1.1378535032272339, - "learning_rate": 4.146940699125992e-05, - "loss": 0.1754, - "step": 349 - }, - { - "epoch": 1.4227642276422765, - "grad_norm": 1.1213337182998657, - "learning_rate": 4.142051801586701e-05, - "loss": 0.2789, - "step": 350 - }, - { - "epoch": 1.4268292682926829, - "grad_norm": 0.6373035311698914, - "learning_rate": 4.137151834863213e-05, - "loss": 0.2283, - "step": 351 - }, - { - "epoch": 1.4308943089430894, - "grad_norm": 1.7028007507324219, - "learning_rate": 4.1322408319865395e-05, - "loss": 0.3415, - "step": 352 - }, - { - "epoch": 1.434959349593496, - "grad_norm": 0.9557713866233826, - "learning_rate": 4.127318826062091e-05, - "loss": 0.2119, - "step": 353 - }, - { - "epoch": 1.4390243902439024, - "grad_norm": 1.1030943393707275, - "learning_rate": 4.122385850269446e-05, - "loss": 0.29, - "step": 354 - }, - { - "epoch": 1.443089430894309, - "grad_norm": 1.0431095361709595, - "learning_rate": 4.1174419378621344e-05, - "loss": 0.2148, - "step": 355 - }, - { - "epoch": 1.4471544715447155, - "grad_norm": 2.2977609634399414, - "learning_rate": 4.1124871221674096e-05, - "loss": 0.371, - "step": 356 - }, - { - "epoch": 1.451219512195122, - "grad_norm": 1.7879778146743774, - "learning_rate": 4.107521436586027e-05, - "loss": 0.3384, - "step": 357 - }, - { - "epoch": 1.4552845528455285, - "grad_norm": 1.3325597047805786, - "learning_rate": 4.1025449145920124e-05, - "loss": 0.3312, - "step": 358 - }, - { - "epoch": 1.459349593495935, - "grad_norm": 1.0007076263427734, - "learning_rate": 4.097557589732445e-05, - "loss": 0.1632, - "step": 359 - }, - { - "epoch": 1.4634146341463414, - "grad_norm": 1.9280353784561157, - "learning_rate": 4.092559495627224e-05, - "loss": 0.2558, - "step": 360 - }, - { - "epoch": 1.467479674796748, - "grad_norm": 2.4024994373321533, - "learning_rate": 4.0875506659688465e-05, - "loss": 0.3127, - "step": 361 - }, - { - "epoch": 1.4715447154471546, - "grad_norm": 1.622728705406189, - "learning_rate": 4.082531134522176e-05, - "loss": 0.2918, - "step": 362 - }, - { - "epoch": 1.475609756097561, - "grad_norm": 1.539137601852417, - "learning_rate": 4.0775009351242215e-05, - "loss": 0.3047, - "step": 363 - }, - { - "epoch": 1.4796747967479675, - "grad_norm": 1.1157269477844238, - "learning_rate": 4.0724601016839024e-05, - "loss": 0.1799, - "step": 364 - }, - { - "epoch": 1.4837398373983741, - "grad_norm": 1.4390692710876465, - "learning_rate": 4.0674086681818235e-05, - "loss": 0.2958, - "step": 365 - }, - { - "epoch": 1.4878048780487805, - "grad_norm": 1.162617802619934, - "learning_rate": 4.062346668670046e-05, - "loss": 0.3242, - "step": 366 - }, - { - "epoch": 1.491869918699187, - "grad_norm": 1.1729458570480347, - "learning_rate": 4.0572741372718584e-05, - "loss": 0.2434, - "step": 367 - }, - { - "epoch": 1.4959349593495934, - "grad_norm": 1.053901195526123, - "learning_rate": 4.052191108181541e-05, - "loss": 0.2589, - "step": 368 - }, - { - "epoch": 1.5, - "grad_norm": 2.150946855545044, - "learning_rate": 4.047097615664145e-05, - "loss": 0.3882, - "step": 369 - }, - { - "epoch": 1.5040650406504064, - "grad_norm": 1.4245436191558838, - "learning_rate": 4.041993694055253e-05, - "loss": 0.2658, - "step": 370 - }, - { - "epoch": 1.5081300813008132, - "grad_norm": 1.7292096614837646, - "learning_rate": 4.0368793777607524e-05, - "loss": 0.3799, - "step": 371 - }, - { - "epoch": 1.5121951219512195, - "grad_norm": 1.3922419548034668, - "learning_rate": 4.031754701256601e-05, - "loss": 0.3255, - "step": 372 - }, - { - "epoch": 1.5162601626016259, - "grad_norm": 1.1905940771102905, - "learning_rate": 4.0266196990885955e-05, - "loss": 0.2776, - "step": 373 - }, - { - "epoch": 1.5203252032520327, - "grad_norm": 1.6942836046218872, - "learning_rate": 4.021474405872139e-05, - "loss": 0.2916, - "step": 374 - }, - { - "epoch": 1.524390243902439, - "grad_norm": 0.9484306573867798, - "learning_rate": 4.016318856292008e-05, - "loss": 0.2584, - "step": 375 - }, - { - "epoch": 1.5284552845528454, - "grad_norm": 1.167858362197876, - "learning_rate": 4.0111530851021164e-05, - "loss": 0.2383, - "step": 376 - }, - { - "epoch": 1.532520325203252, - "grad_norm": 1.0677193403244019, - "learning_rate": 4.005977127125282e-05, - "loss": 0.1705, - "step": 377 - }, - { - "epoch": 1.5365853658536586, - "grad_norm": 1.2955567836761475, - "learning_rate": 4.000791017252996e-05, - "loss": 0.2774, - "step": 378 - }, - { - "epoch": 1.540650406504065, - "grad_norm": 0.803046703338623, - "learning_rate": 3.995594790445181e-05, - "loss": 0.2324, - "step": 379 - }, - { - "epoch": 1.5447154471544715, - "grad_norm": 1.7316648960113525, - "learning_rate": 3.990388481729959e-05, - "loss": 0.3892, - "step": 380 - }, - { - "epoch": 1.548780487804878, - "grad_norm": 1.0649036169052124, - "learning_rate": 3.9851721262034156e-05, - "loss": 0.3569, - "step": 381 - }, - { - "epoch": 1.5528455284552845, - "grad_norm": 1.4234437942504883, - "learning_rate": 3.979945759029363e-05, - "loss": 0.2976, - "step": 382 - }, - { - "epoch": 1.556910569105691, - "grad_norm": 2.0413475036621094, - "learning_rate": 3.9747094154391014e-05, - "loss": 0.3385, - "step": 383 - }, - { - "epoch": 1.5609756097560976, - "grad_norm": 1.6913297176361084, - "learning_rate": 3.969463130731183e-05, - "loss": 0.3915, - "step": 384 - }, - { - "epoch": 1.565040650406504, - "grad_norm": 1.508286476135254, - "learning_rate": 3.9642069402711745e-05, - "loss": 0.2885, - "step": 385 - }, - { - "epoch": 1.5691056910569106, - "grad_norm": 0.9865254759788513, - "learning_rate": 3.958940879491418e-05, - "loss": 0.3157, - "step": 386 - }, - { - "epoch": 1.5731707317073171, - "grad_norm": 0.9907522201538086, - "learning_rate": 3.953664983890791e-05, - "loss": 0.3143, - "step": 387 - }, - { - "epoch": 1.5772357723577235, - "grad_norm": 1.5813465118408203, - "learning_rate": 3.948379289034469e-05, - "loss": 0.3221, - "step": 388 - }, - { - "epoch": 1.58130081300813, - "grad_norm": 1.3218395709991455, - "learning_rate": 3.9430838305536845e-05, - "loss": 0.2339, - "step": 389 - }, - { - "epoch": 1.5853658536585367, - "grad_norm": 1.3569320440292358, - "learning_rate": 3.937778644145488e-05, - "loss": 0.3141, - "step": 390 - }, - { - "epoch": 1.589430894308943, - "grad_norm": 0.9248982071876526, - "learning_rate": 3.9324637655725055e-05, - "loss": 0.2329, - "step": 391 - }, - { - "epoch": 1.5934959349593496, - "grad_norm": 1.2585185766220093, - "learning_rate": 3.9271392306627e-05, - "loss": 0.2654, - "step": 392 - }, - { - "epoch": 1.5975609756097562, - "grad_norm": 0.9639501571655273, - "learning_rate": 3.9218050753091274e-05, - "loss": 0.1646, - "step": 393 - }, - { - "epoch": 1.6016260162601625, - "grad_norm": 1.0362088680267334, - "learning_rate": 3.9164613354696954e-05, - "loss": 0.2212, - "step": 394 - }, - { - "epoch": 1.6056910569105691, - "grad_norm": 1.3693809509277344, - "learning_rate": 3.911108047166924e-05, - "loss": 0.3016, - "step": 395 - }, - { - "epoch": 1.6097560975609757, - "grad_norm": 2.326934337615967, - "learning_rate": 3.905745246487695e-05, - "loss": 0.3291, - "step": 396 - }, - { - "epoch": 1.613821138211382, - "grad_norm": 1.4838097095489502, - "learning_rate": 3.9003729695830194e-05, - "loss": 0.3211, - "step": 397 - }, - { - "epoch": 1.6178861788617886, - "grad_norm": 1.4616005420684814, - "learning_rate": 3.894991252667785e-05, - "loss": 0.3385, - "step": 398 - }, - { - "epoch": 1.6219512195121952, - "grad_norm": 1.5243289470672607, - "learning_rate": 3.8896001320205143e-05, - "loss": 0.2914, - "step": 399 - }, - { - "epoch": 1.6260162601626016, - "grad_norm": 1.3103337287902832, - "learning_rate": 3.884199643983125e-05, - "loss": 0.3144, - "step": 400 - }, - { - "epoch": 1.6300813008130082, - "grad_norm": 1.5412578582763672, - "learning_rate": 3.878789824960677e-05, - "loss": 0.2287, - "step": 401 - }, - { - "epoch": 1.6341463414634148, - "grad_norm": 1.3176844120025635, - "learning_rate": 3.8733707114211325e-05, - "loss": 0.245, - "step": 402 - }, - { - "epoch": 1.6382113821138211, - "grad_norm": 2.1294362545013428, - "learning_rate": 3.86794233989511e-05, - "loss": 0.404, - "step": 403 - }, - { - "epoch": 1.6422764227642277, - "grad_norm": 0.8553035259246826, - "learning_rate": 3.862504746975635e-05, - "loss": 0.2651, - "step": 404 - }, - { - "epoch": 1.6463414634146343, - "grad_norm": 1.7764294147491455, - "learning_rate": 3.8570579693178956e-05, - "loss": 0.3413, - "step": 405 - }, - { - "epoch": 1.6504065040650406, - "grad_norm": 2.001711845397949, - "learning_rate": 3.851602043638994e-05, - "loss": 0.4444, - "step": 406 - }, - { - "epoch": 1.654471544715447, - "grad_norm": 1.016660213470459, - "learning_rate": 3.846137006717701e-05, - "loss": 0.2854, - "step": 407 - }, - { - "epoch": 1.6585365853658538, - "grad_norm": 1.6637189388275146, - "learning_rate": 3.840662895394208e-05, - "loss": 0.216, - "step": 408 - }, - { - "epoch": 1.6626016260162602, - "grad_norm": 1.7360148429870605, - "learning_rate": 3.8351797465698754e-05, - "loss": 0.3957, - "step": 409 - }, - { - "epoch": 1.6666666666666665, - "grad_norm": 1.5947576761245728, - "learning_rate": 3.829687597206986e-05, - "loss": 0.2025, - "step": 410 - }, - { - "epoch": 1.6707317073170733, - "grad_norm": 1.154308795928955, - "learning_rate": 3.824186484328497e-05, - "loss": 0.2847, - "step": 411 - }, - { - "epoch": 1.6747967479674797, - "grad_norm": 1.624076008796692, - "learning_rate": 3.818676445017788e-05, - "loss": 0.2604, - "step": 412 - }, - { - "epoch": 1.678861788617886, - "grad_norm": 1.4424537420272827, - "learning_rate": 3.8131575164184155e-05, - "loss": 0.3768, - "step": 413 - }, - { - "epoch": 1.6829268292682928, - "grad_norm": 0.9721991419792175, - "learning_rate": 3.8076297357338554e-05, - "loss": 0.1369, - "step": 414 - }, - { - "epoch": 1.6869918699186992, - "grad_norm": 1.0535129308700562, - "learning_rate": 3.8020931402272574e-05, - "loss": 0.2278, - "step": 415 - }, - { - "epoch": 1.6910569105691056, - "grad_norm": 1.168233036994934, - "learning_rate": 3.796547767221194e-05, - "loss": 0.2875, - "step": 416 - }, - { - "epoch": 1.6951219512195121, - "grad_norm": 1.0156890153884888, - "learning_rate": 3.790993654097405e-05, - "loss": 0.2435, - "step": 417 - }, - { - "epoch": 1.6991869918699187, - "grad_norm": 1.2593859434127808, - "learning_rate": 3.7854308382965506e-05, - "loss": 0.2677, - "step": 418 - }, - { - "epoch": 1.703252032520325, - "grad_norm": 1.8139991760253906, - "learning_rate": 3.779859357317953e-05, - "loss": 0.3816, - "step": 419 - }, - { - "epoch": 1.7073170731707317, - "grad_norm": 2.1028025150299072, - "learning_rate": 3.774279248719351e-05, - "loss": 0.3885, - "step": 420 - }, - { - "epoch": 1.7113821138211383, - "grad_norm": 1.4801568984985352, - "learning_rate": 3.768690550116639e-05, - "loss": 0.3025, - "step": 421 - }, - { - "epoch": 1.7154471544715446, - "grad_norm": 1.0483862161636353, - "learning_rate": 3.763093299183621e-05, - "loss": 0.2407, - "step": 422 - }, - { - "epoch": 1.7195121951219512, - "grad_norm": 2.1429080963134766, - "learning_rate": 3.757487533651748e-05, - "loss": 0.4695, - "step": 423 - }, - { - "epoch": 1.7235772357723578, - "grad_norm": 0.9022377729415894, - "learning_rate": 3.751873291309873e-05, - "loss": 0.1927, - "step": 424 - }, - { - "epoch": 1.7276422764227641, - "grad_norm": 1.051822543144226, - "learning_rate": 3.7462506100039896e-05, - "loss": 0.3144, - "step": 425 - }, - { - "epoch": 1.7317073170731707, - "grad_norm": 2.0977585315704346, - "learning_rate": 3.7406195276369796e-05, - "loss": 0.2881, - "step": 426 - }, - { - "epoch": 1.7357723577235773, - "grad_norm": 1.6701133251190186, - "learning_rate": 3.7349800821683554e-05, - "loss": 0.2694, - "step": 427 - }, - { - "epoch": 1.7398373983739837, - "grad_norm": 1.407677412033081, - "learning_rate": 3.72933231161401e-05, - "loss": 0.2166, - "step": 428 - }, - { - "epoch": 1.7439024390243902, - "grad_norm": 1.3758926391601562, - "learning_rate": 3.723676254045951e-05, - "loss": 0.3866, - "step": 429 - }, - { - "epoch": 1.7479674796747968, - "grad_norm": 1.3261126279830933, - "learning_rate": 3.718011947592051e-05, - "loss": 0.2749, - "step": 430 - }, - { - "epoch": 1.7520325203252032, - "grad_norm": 1.12800931930542, - "learning_rate": 3.712339430435792e-05, - "loss": 0.3511, - "step": 431 - }, - { - "epoch": 1.7560975609756098, - "grad_norm": 1.327278971672058, - "learning_rate": 3.706658740816001e-05, - "loss": 0.339, - "step": 432 - }, - { - "epoch": 1.7601626016260163, - "grad_norm": 1.5774999856948853, - "learning_rate": 3.7009699170265985e-05, - "loss": 0.306, - "step": 433 - }, - { - "epoch": 1.7642276422764227, - "grad_norm": 1.0912119150161743, - "learning_rate": 3.695272997416336e-05, - "loss": 0.3298, - "step": 434 - }, - { - "epoch": 1.7682926829268293, - "grad_norm": 1.1372374296188354, - "learning_rate": 3.6895680203885416e-05, - "loss": 0.2817, - "step": 435 - }, - { - "epoch": 1.7723577235772359, - "grad_norm": 0.8853081464767456, - "learning_rate": 3.6838550244008576e-05, - "loss": 0.1797, - "step": 436 - }, - { - "epoch": 1.7764227642276422, - "grad_norm": 1.3123356103897095, - "learning_rate": 3.678134047964983e-05, - "loss": 0.2309, - "step": 437 - }, - { - "epoch": 1.7804878048780488, - "grad_norm": 1.149476408958435, - "learning_rate": 3.672405129646414e-05, - "loss": 0.3305, - "step": 438 - }, - { - "epoch": 1.7845528455284554, - "grad_norm": 1.0130585432052612, - "learning_rate": 3.6666683080641846e-05, - "loss": 0.2867, - "step": 439 - }, - { - "epoch": 1.7886178861788617, - "grad_norm": 1.1584160327911377, - "learning_rate": 3.660923621890601e-05, - "loss": 0.3451, - "step": 440 - }, - { - "epoch": 1.7926829268292683, - "grad_norm": 1.0909764766693115, - "learning_rate": 3.6551711098509906e-05, - "loss": 0.2697, - "step": 441 - }, - { - "epoch": 1.796747967479675, - "grad_norm": 1.3893420696258545, - "learning_rate": 3.649410810723431e-05, - "loss": 0.316, - "step": 442 - }, - { - "epoch": 1.8008130081300813, - "grad_norm": 1.4060670137405396, - "learning_rate": 3.643642763338497e-05, - "loss": 0.2217, - "step": 443 - }, - { - "epoch": 1.8048780487804879, - "grad_norm": 1.2033827304840088, - "learning_rate": 3.6378670065789905e-05, - "loss": 0.3251, - "step": 444 - }, - { - "epoch": 1.8089430894308944, - "grad_norm": 1.1884338855743408, - "learning_rate": 3.632083579379687e-05, - "loss": 0.2586, - "step": 445 - }, - { - "epoch": 1.8130081300813008, - "grad_norm": 1.5869964361190796, - "learning_rate": 3.626292520727067e-05, - "loss": 0.3504, - "step": 446 - }, - { - "epoch": 1.8170731707317072, - "grad_norm": 1.143086314201355, - "learning_rate": 3.620493869659055e-05, - "loss": 0.2767, - "step": 447 - }, - { - "epoch": 1.821138211382114, - "grad_norm": 1.1968578100204468, - "learning_rate": 3.6146876652647585e-05, - "loss": 0.2467, - "step": 448 - }, - { - "epoch": 1.8252032520325203, - "grad_norm": 1.1119422912597656, - "learning_rate": 3.6088739466841984e-05, - "loss": 0.1919, - "step": 449 - }, - { - "epoch": 1.8292682926829267, - "grad_norm": 1.8164317607879639, - "learning_rate": 3.603052753108053e-05, - "loss": 0.3848, - "step": 450 - }, - { - "epoch": 1.8333333333333335, - "grad_norm": 1.4222265481948853, - "learning_rate": 3.59722412377739e-05, - "loss": 0.2722, - "step": 451 - }, - { - "epoch": 1.8373983739837398, - "grad_norm": 0.9048798084259033, - "learning_rate": 3.591388097983398e-05, - "loss": 0.238, - "step": 452 - }, - { - "epoch": 1.8414634146341462, - "grad_norm": 1.4711002111434937, - "learning_rate": 3.585544715067131e-05, - "loss": 0.3519, - "step": 453 - }, - { - "epoch": 1.845528455284553, - "grad_norm": 0.9332497715950012, - "learning_rate": 3.5796940144192335e-05, - "loss": 0.2304, - "step": 454 - }, - { - "epoch": 1.8495934959349594, - "grad_norm": 1.1078135967254639, - "learning_rate": 3.5738360354796833e-05, - "loss": 0.1427, - "step": 455 - }, - { - "epoch": 1.8536585365853657, - "grad_norm": 0.8883565068244934, - "learning_rate": 3.567970817737518e-05, - "loss": 0.174, - "step": 456 - }, - { - "epoch": 1.8577235772357723, - "grad_norm": 1.1785696744918823, - "learning_rate": 3.562098400730575e-05, - "loss": 0.2507, - "step": 457 - }, - { - "epoch": 1.8617886178861789, - "grad_norm": 1.9816657304763794, - "learning_rate": 3.55621882404522e-05, - "loss": 0.468, - "step": 458 - }, - { - "epoch": 1.8658536585365852, - "grad_norm": 1.2859241962432861, - "learning_rate": 3.550332127316085e-05, - "loss": 0.2465, - "step": 459 - }, - { - "epoch": 1.8699186991869918, - "grad_norm": 1.491979718208313, - "learning_rate": 3.544438350225799e-05, - "loss": 0.3593, - "step": 460 - }, - { - "epoch": 1.8739837398373984, - "grad_norm": 0.8754283785820007, - "learning_rate": 3.5385375325047166e-05, - "loss": 0.2199, - "step": 461 - }, - { - "epoch": 1.8780487804878048, - "grad_norm": 0.8306921124458313, - "learning_rate": 3.5326297139306575e-05, - "loss": 0.1653, - "step": 462 - }, - { - "epoch": 1.8821138211382114, - "grad_norm": 0.8698571920394897, - "learning_rate": 3.5267149343286327e-05, - "loss": 0.2443, - "step": 463 - }, - { - "epoch": 1.886178861788618, - "grad_norm": 1.2175384759902954, - "learning_rate": 3.5207932335705794e-05, - "loss": 0.2664, - "step": 464 - }, - { - "epoch": 1.8902439024390243, - "grad_norm": 1.4111195802688599, - "learning_rate": 3.514864651575089e-05, - "loss": 0.4547, - "step": 465 - }, - { - "epoch": 1.8943089430894309, - "grad_norm": 1.2638578414916992, - "learning_rate": 3.508929228307142e-05, - "loss": 0.2708, - "step": 466 - }, - { - "epoch": 1.8983739837398375, - "grad_norm": 1.2739440202713013, - "learning_rate": 3.502987003777833e-05, - "loss": 0.2167, - "step": 467 - }, - { - "epoch": 1.9024390243902438, - "grad_norm": 1.775568962097168, - "learning_rate": 3.497038018044109e-05, - "loss": 0.3437, - "step": 468 - }, - { - "epoch": 1.9065040650406504, - "grad_norm": 1.1396230459213257, - "learning_rate": 3.49108231120849e-05, - "loss": 0.3199, - "step": 469 - }, - { - "epoch": 1.910569105691057, - "grad_norm": 0.9509851932525635, - "learning_rate": 3.485119923418807e-05, - "loss": 0.292, - "step": 470 - }, - { - "epoch": 1.9146341463414633, - "grad_norm": 1.5866367816925049, - "learning_rate": 3.479150894867926e-05, - "loss": 0.3416, - "step": 471 - }, - { - "epoch": 1.91869918699187, - "grad_norm": 1.720947027206421, - "learning_rate": 3.4731752657934794e-05, - "loss": 0.3043, - "step": 472 - }, - { - "epoch": 1.9227642276422765, - "grad_norm": 1.6866365671157837, - "learning_rate": 3.467193076477594e-05, - "loss": 0.328, - "step": 473 - }, - { - "epoch": 1.9268292682926829, - "grad_norm": 0.9281228184700012, - "learning_rate": 3.461204367246619e-05, - "loss": 0.3105, - "step": 474 - }, - { - "epoch": 1.9308943089430894, - "grad_norm": 1.0874384641647339, - "learning_rate": 3.4552091784708554e-05, - "loss": 0.3374, - "step": 475 - }, - { - "epoch": 1.934959349593496, - "grad_norm": 1.6240037679672241, - "learning_rate": 3.449207550564285e-05, - "loss": 0.2687, - "step": 476 - }, - { - "epoch": 1.9390243902439024, - "grad_norm": 1.4694489240646362, - "learning_rate": 3.443199523984293e-05, - "loss": 0.2701, - "step": 477 - }, - { - "epoch": 1.943089430894309, - "grad_norm": 0.9504182934761047, - "learning_rate": 3.437185139231402e-05, - "loss": 0.2796, - "step": 478 - }, - { - "epoch": 1.9471544715447155, - "grad_norm": 0.9461456537246704, - "learning_rate": 3.431164436848989e-05, - "loss": 0.275, - "step": 479 - }, - { - "epoch": 1.951219512195122, - "grad_norm": 1.743233561515808, - "learning_rate": 3.425137457423029e-05, - "loss": 0.2852, - "step": 480 - }, - { - "epoch": 1.9552845528455285, - "grad_norm": 1.6488287448883057, - "learning_rate": 3.4191042415818e-05, - "loss": 0.325, - "step": 481 - }, - { - "epoch": 1.959349593495935, - "grad_norm": 1.2071924209594727, - "learning_rate": 3.4130648299956294e-05, - "loss": 0.2893, - "step": 482 - }, - { - "epoch": 1.9634146341463414, - "grad_norm": 1.5142889022827148, - "learning_rate": 3.4070192633766025e-05, - "loss": 0.3381, - "step": 483 - }, - { - "epoch": 1.967479674796748, - "grad_norm": 0.9330607652664185, - "learning_rate": 3.400967582478303e-05, - "loss": 0.2538, - "step": 484 - }, - { - "epoch": 1.9715447154471546, - "grad_norm": 1.4295389652252197, - "learning_rate": 3.394909828095526e-05, - "loss": 0.2301, - "step": 485 - }, - { - "epoch": 1.975609756097561, - "grad_norm": 1.0548290014266968, - "learning_rate": 3.388846041064012e-05, - "loss": 0.2424, - "step": 486 - }, - { - "epoch": 1.9796747967479673, - "grad_norm": 1.6041169166564941, - "learning_rate": 3.3827762622601665e-05, - "loss": 0.1775, - "step": 487 - }, - { - "epoch": 1.9837398373983741, - "grad_norm": 1.4730509519577026, - "learning_rate": 3.376700532600786e-05, - "loss": 0.3907, - "step": 488 - }, - { - "epoch": 1.9878048780487805, - "grad_norm": 1.50563383102417, - "learning_rate": 3.3706188930427827e-05, - "loss": 0.3296, - "step": 489 - }, - { - "epoch": 1.9918699186991868, - "grad_norm": 0.8891299366950989, - "learning_rate": 3.3645313845829066e-05, - "loss": 0.2303, - "step": 490 - }, - { - "epoch": 1.9959349593495936, - "grad_norm": 1.1380308866500854, - "learning_rate": 3.3584380482574716e-05, - "loss": 0.2577, - "step": 491 - }, - { - "epoch": 2.0, - "grad_norm": 1.3352149724960327, - "learning_rate": 3.3523389251420766e-05, - "loss": 0.296, - "step": 492 - }, - { - "epoch": 2.0, - "eval_accuracy": 0.904822496500827, - "eval_loss": 0.2880232334136963, - "eval_precision_bio": 0.5375, - "eval_precision_cs": 0.9193591101694916, - "eval_precision_math": 0.564625850340136, - "eval_precision_physics": 0.525, - "eval_recall_bio": 0.24431818181818182, - "eval_recall_cs": 0.9834277620396601, - "eval_recall_math": 0.19483568075117372, - "eval_recall_physics": 0.2131979695431472, - "eval_runtime": 31.379, - "eval_samples_per_second": 250.454, - "eval_steps_per_second": 31.327, - "step": 492 - }, - { - "epoch": 2.0040650406504064, - "grad_norm": 1.1771836280822754, - "learning_rate": 3.3462340563513316e-05, - "loss": 0.2163, - "step": 493 - }, - { - "epoch": 2.008130081300813, - "grad_norm": 1.0600090026855469, - "learning_rate": 3.3401234830385756e-05, - "loss": 0.2669, - "step": 494 - }, - { - "epoch": 2.0121951219512195, - "grad_norm": 1.6154742240905762, - "learning_rate": 3.334007246395605e-05, - "loss": 0.1257, - "step": 495 - }, - { - "epoch": 2.016260162601626, - "grad_norm": 0.8353559374809265, - "learning_rate": 3.327885387652391e-05, - "loss": 0.1779, - "step": 496 - }, - { - "epoch": 2.0203252032520327, - "grad_norm": 1.0554636716842651, - "learning_rate": 3.321757948076806e-05, - "loss": 0.2709, - "step": 497 - }, - { - "epoch": 2.024390243902439, - "grad_norm": 1.6010822057724, - "learning_rate": 3.3156249689743415e-05, - "loss": 0.3554, - "step": 498 - }, - { - "epoch": 2.0284552845528454, - "grad_norm": 1.8335533142089844, - "learning_rate": 3.30948649168783e-05, - "loss": 0.2908, - "step": 499 - }, - { - "epoch": 2.032520325203252, - "grad_norm": 1.5793169736862183, - "learning_rate": 3.303342557597171e-05, - "loss": 0.3799, - "step": 500 - }, - { - "epoch": 2.0365853658536586, - "grad_norm": 1.5004339218139648, - "learning_rate": 3.297193208119047e-05, - "loss": 0.2924, - "step": 501 - }, - { - "epoch": 2.040650406504065, - "grad_norm": 1.5615473985671997, - "learning_rate": 3.2910384847066455e-05, - "loss": 0.2223, - "step": 502 - }, - { - "epoch": 2.0447154471544717, - "grad_norm": 1.212364912033081, - "learning_rate": 3.2848784288493804e-05, - "loss": 0.2501, - "step": 503 - }, - { - "epoch": 2.048780487804878, - "grad_norm": 1.2516722679138184, - "learning_rate": 3.278713082072613e-05, - "loss": 0.2494, - "step": 504 - }, - { - "epoch": 2.0528455284552845, - "grad_norm": 1.1285215616226196, - "learning_rate": 3.272542485937369e-05, - "loss": 0.2982, - "step": 505 - }, - { - "epoch": 2.0569105691056913, - "grad_norm": 1.323266863822937, - "learning_rate": 3.266366682040063e-05, - "loss": 0.3391, - "step": 506 - }, - { - "epoch": 2.0609756097560976, - "grad_norm": 0.9063870310783386, - "learning_rate": 3.2601857120122126e-05, - "loss": 0.2772, - "step": 507 - }, - { - "epoch": 2.065040650406504, - "grad_norm": 0.9748317003250122, - "learning_rate": 3.2539996175201636e-05, - "loss": 0.2312, - "step": 508 - }, - { - "epoch": 2.069105691056911, - "grad_norm": 1.2283767461776733, - "learning_rate": 3.2478084402648034e-05, - "loss": 0.342, - "step": 509 - }, - { - "epoch": 2.073170731707317, - "grad_norm": 1.0281082391738892, - "learning_rate": 3.2416122219812846e-05, - "loss": 0.1575, - "step": 510 - }, - { - "epoch": 2.0772357723577235, - "grad_norm": 0.981675386428833, - "learning_rate": 3.235411004438741e-05, - "loss": 0.209, - "step": 511 - }, - { - "epoch": 2.08130081300813, - "grad_norm": 1.0950697660446167, - "learning_rate": 3.229204829440007e-05, - "loss": 0.2137, - "step": 512 - }, - { - "epoch": 2.0853658536585367, - "grad_norm": 1.3254408836364746, - "learning_rate": 3.222993738821335e-05, - "loss": 0.2812, - "step": 513 - }, - { - "epoch": 2.089430894308943, - "grad_norm": 1.6176317930221558, - "learning_rate": 3.216777774452114e-05, - "loss": 0.2861, - "step": 514 - }, - { - "epoch": 2.0934959349593494, - "grad_norm": 1.0856714248657227, - "learning_rate": 3.2105569782345896e-05, - "loss": 0.2325, - "step": 515 - }, - { - "epoch": 2.097560975609756, - "grad_norm": 1.0221620798110962, - "learning_rate": 3.2043313921035743e-05, - "loss": 0.1455, - "step": 516 - }, - { - "epoch": 2.1016260162601625, - "grad_norm": 1.3463088274002075, - "learning_rate": 3.198101058026174e-05, - "loss": 0.2173, - "step": 517 - }, - { - "epoch": 2.105691056910569, - "grad_norm": 1.1067454814910889, - "learning_rate": 3.1918660180015e-05, - "loss": 0.1904, - "step": 518 - }, - { - "epoch": 2.1097560975609757, - "grad_norm": 2.5507760047912598, - "learning_rate": 3.185626314060386e-05, - "loss": 0.363, - "step": 519 - }, - { - "epoch": 2.113821138211382, - "grad_norm": 0.9818494319915771, - "learning_rate": 3.179381988265104e-05, - "loss": 0.2421, - "step": 520 - }, - { - "epoch": 2.1178861788617884, - "grad_norm": 1.2514492273330688, - "learning_rate": 3.1731330827090865e-05, - "loss": 0.3398, - "step": 521 - }, - { - "epoch": 2.1219512195121952, - "grad_norm": 0.9701881408691406, - "learning_rate": 3.166879639516634e-05, - "loss": 0.2135, - "step": 522 - }, - { - "epoch": 2.1260162601626016, - "grad_norm": 3.096705198287964, - "learning_rate": 3.160621700842638e-05, - "loss": 0.3802, - "step": 523 - }, - { - "epoch": 2.130081300813008, - "grad_norm": 1.1816494464874268, - "learning_rate": 3.154359308872294e-05, - "loss": 0.286, - "step": 524 - }, - { - "epoch": 2.1341463414634148, - "grad_norm": 1.1936464309692383, - "learning_rate": 3.148092505820817e-05, - "loss": 0.2679, - "step": 525 - }, - { - "epoch": 2.138211382113821, - "grad_norm": 1.253594994544983, - "learning_rate": 3.141821333933158e-05, - "loss": 0.342, - "step": 526 - }, - { - "epoch": 2.1422764227642275, - "grad_norm": 1.218159794807434, - "learning_rate": 3.135545835483718e-05, - "loss": 0.2234, - "step": 527 - }, - { - "epoch": 2.1463414634146343, - "grad_norm": 1.3563333749771118, - "learning_rate": 3.129266052776063e-05, - "loss": 0.2527, - "step": 528 - }, - { - "epoch": 2.1504065040650406, - "grad_norm": 1.1279078722000122, - "learning_rate": 3.122982028142642e-05, - "loss": 0.2444, - "step": 529 - }, - { - "epoch": 2.154471544715447, - "grad_norm": 1.017080545425415, - "learning_rate": 3.116693803944497e-05, - "loss": 0.2082, - "step": 530 - }, - { - "epoch": 2.158536585365854, - "grad_norm": 1.1646355390548706, - "learning_rate": 3.110401422570979e-05, - "loss": 0.2482, - "step": 531 - }, - { - "epoch": 2.16260162601626, - "grad_norm": 1.3241193294525146, - "learning_rate": 3.1041049264394643e-05, - "loss": 0.361, - "step": 532 - }, - { - "epoch": 2.1666666666666665, - "grad_norm": 1.3235691785812378, - "learning_rate": 3.0978043579950684e-05, - "loss": 0.2228, - "step": 533 - }, - { - "epoch": 2.1707317073170733, - "grad_norm": 1.1817597150802612, - "learning_rate": 3.0914997597103547e-05, - "loss": 0.2328, - "step": 534 - }, - { - "epoch": 2.1747967479674797, - "grad_norm": 1.2626383304595947, - "learning_rate": 3.085191174085055e-05, - "loss": 0.2, - "step": 535 - }, - { - "epoch": 2.178861788617886, - "grad_norm": 0.8942603468894958, - "learning_rate": 3.078878643645778e-05, - "loss": 0.2555, - "step": 536 - }, - { - "epoch": 2.182926829268293, - "grad_norm": 1.488521695137024, - "learning_rate": 3.072562210945729e-05, - "loss": 0.2529, - "step": 537 - }, - { - "epoch": 2.186991869918699, - "grad_norm": 1.11421537399292, - "learning_rate": 3.0662419185644115e-05, - "loss": 0.2518, - "step": 538 - }, - { - "epoch": 2.1910569105691056, - "grad_norm": 0.8021209836006165, - "learning_rate": 3.0599178091073535e-05, - "loss": 0.1574, - "step": 539 - }, - { - "epoch": 2.1951219512195124, - "grad_norm": 1.1324124336242676, - "learning_rate": 3.053589925205812e-05, - "loss": 0.2307, - "step": 540 - }, - { - "epoch": 2.1991869918699187, - "grad_norm": 1.401986002922058, - "learning_rate": 3.0472583095164874e-05, - "loss": 0.2175, - "step": 541 - }, - { - "epoch": 2.203252032520325, - "grad_norm": 1.2225003242492676, - "learning_rate": 3.040923004721237e-05, - "loss": 0.1351, - "step": 542 - }, - { - "epoch": 2.207317073170732, - "grad_norm": 1.0290645360946655, - "learning_rate": 3.0345840535267868e-05, - "loss": 0.1928, - "step": 543 - }, - { - "epoch": 2.2113821138211383, - "grad_norm": 2.2555384635925293, - "learning_rate": 3.028241498664442e-05, - "loss": 0.363, - "step": 544 - }, - { - "epoch": 2.2154471544715446, - "grad_norm": 3.320665121078491, - "learning_rate": 3.0218953828898017e-05, - "loss": 0.4769, - "step": 545 - }, - { - "epoch": 2.2195121951219514, - "grad_norm": 1.4269710779190063, - "learning_rate": 3.0155457489824705e-05, - "loss": 0.3227, - "step": 546 - }, - { - "epoch": 2.2235772357723578, - "grad_norm": 1.7438057661056519, - "learning_rate": 3.0091926397457664e-05, - "loss": 0.286, - "step": 547 - }, - { - "epoch": 2.227642276422764, - "grad_norm": 1.379206895828247, - "learning_rate": 3.0028360980064353e-05, - "loss": 0.1981, - "step": 548 - }, - { - "epoch": 2.231707317073171, - "grad_norm": 1.8639917373657227, - "learning_rate": 2.996476166614364e-05, - "loss": 0.2533, - "step": 549 - }, - { - "epoch": 2.2357723577235773, - "grad_norm": 1.671952486038208, - "learning_rate": 2.9901128884422863e-05, - "loss": 0.3258, - "step": 550 - }, - { - "epoch": 2.2398373983739837, - "grad_norm": 1.4555351734161377, - "learning_rate": 2.9837463063854994e-05, - "loss": 0.1825, - "step": 551 - }, - { - "epoch": 2.2439024390243905, - "grad_norm": 1.3574374914169312, - "learning_rate": 2.9773764633615703e-05, - "loss": 0.1799, - "step": 552 - }, - { - "epoch": 2.247967479674797, - "grad_norm": 1.5757508277893066, - "learning_rate": 2.9710034023100498e-05, - "loss": 0.2487, - "step": 553 - }, - { - "epoch": 2.252032520325203, - "grad_norm": 1.8196839094161987, - "learning_rate": 2.96462716619218e-05, - "loss": 0.2208, - "step": 554 - }, - { - "epoch": 2.2560975609756095, - "grad_norm": 1.6672518253326416, - "learning_rate": 2.9582477979906088e-05, - "loss": 0.2563, - "step": 555 - }, - { - "epoch": 2.2601626016260163, - "grad_norm": 1.7229293584823608, - "learning_rate": 2.951865340709095e-05, - "loss": 0.2198, - "step": 556 - }, - { - "epoch": 2.2642276422764227, - "grad_norm": 1.8526707887649536, - "learning_rate": 2.9454798373722238e-05, - "loss": 0.3778, - "step": 557 - }, - { - "epoch": 2.2682926829268295, - "grad_norm": 1.3256959915161133, - "learning_rate": 2.9390913310251112e-05, - "loss": 0.221, - "step": 558 - }, - { - "epoch": 2.272357723577236, - "grad_norm": 1.8876243829727173, - "learning_rate": 2.9326998647331184e-05, - "loss": 0.3361, - "step": 559 - }, - { - "epoch": 2.2764227642276422, - "grad_norm": 1.489292860031128, - "learning_rate": 2.92630548158156e-05, - "loss": 0.2503, - "step": 560 - }, - { - "epoch": 2.2804878048780486, - "grad_norm": 1.4212514162063599, - "learning_rate": 2.9199082246754122e-05, - "loss": 0.2541, - "step": 561 - }, - { - "epoch": 2.2845528455284554, - "grad_norm": 1.3725614547729492, - "learning_rate": 2.9135081371390255e-05, - "loss": 0.2659, - "step": 562 - }, - { - "epoch": 2.2886178861788617, - "grad_norm": 1.0399935245513916, - "learning_rate": 2.9071052621158285e-05, - "loss": 0.2301, - "step": 563 - }, - { - "epoch": 2.292682926829268, - "grad_norm": 1.3643983602523804, - "learning_rate": 2.9006996427680438e-05, - "loss": 0.3158, - "step": 564 - }, - { - "epoch": 2.296747967479675, - "grad_norm": 1.252954125404358, - "learning_rate": 2.894291322276391e-05, - "loss": 0.27, - "step": 565 - }, - { - "epoch": 2.3008130081300813, - "grad_norm": 1.650136113166809, - "learning_rate": 2.8878803438398016e-05, - "loss": 0.3836, - "step": 566 - }, - { - "epoch": 2.3048780487804876, - "grad_norm": 1.6028729677200317, - "learning_rate": 2.881466750675121e-05, - "loss": 0.4079, - "step": 567 - }, - { - "epoch": 2.3089430894308944, - "grad_norm": 1.521161437034607, - "learning_rate": 2.8750505860168215e-05, - "loss": 0.3157, - "step": 568 - }, - { - "epoch": 2.313008130081301, - "grad_norm": 1.1299517154693604, - "learning_rate": 2.8686318931167116e-05, - "loss": 0.2044, - "step": 569 - }, - { - "epoch": 2.317073170731707, - "grad_norm": 1.4860491752624512, - "learning_rate": 2.862210715243641e-05, - "loss": 0.1843, - "step": 570 - }, - { - "epoch": 2.321138211382114, - "grad_norm": 1.299536943435669, - "learning_rate": 2.8557870956832132e-05, - "loss": 0.2069, - "step": 571 - }, - { - "epoch": 2.3252032520325203, - "grad_norm": 1.301218032836914, - "learning_rate": 2.8493610777374884e-05, - "loss": 0.2787, - "step": 572 - }, - { - "epoch": 2.3292682926829267, - "grad_norm": 1.3405524492263794, - "learning_rate": 2.8429327047246962e-05, - "loss": 0.2627, - "step": 573 - }, - { - "epoch": 2.3333333333333335, - "grad_norm": 0.8486014008522034, - "learning_rate": 2.836502019978941e-05, - "loss": 0.2132, - "step": 574 - }, - { - "epoch": 2.33739837398374, - "grad_norm": 0.9456762075424194, - "learning_rate": 2.8300690668499125e-05, - "loss": 0.1495, - "step": 575 - }, - { - "epoch": 2.341463414634146, - "grad_norm": 1.6021485328674316, - "learning_rate": 2.8236338887025886e-05, - "loss": 0.3012, - "step": 576 - }, - { - "epoch": 2.345528455284553, - "grad_norm": 1.1046061515808105, - "learning_rate": 2.8171965289169494e-05, - "loss": 0.1187, - "step": 577 - }, - { - "epoch": 2.3495934959349594, - "grad_norm": 1.5209181308746338, - "learning_rate": 2.81075703088768e-05, - "loss": 0.3194, - "step": 578 - }, - { - "epoch": 2.3536585365853657, - "grad_norm": 1.0590543746948242, - "learning_rate": 2.8043154380238796e-05, - "loss": 0.2118, - "step": 579 - }, - { - "epoch": 2.3577235772357725, - "grad_norm": 1.1014611721038818, - "learning_rate": 2.7978717937487697e-05, - "loss": 0.1883, - "step": 580 - }, - { - "epoch": 2.361788617886179, - "grad_norm": 1.1781576871871948, - "learning_rate": 2.7914261414993982e-05, - "loss": 0.2298, - "step": 581 - }, - { - "epoch": 2.3658536585365852, - "grad_norm": 1.0693587064743042, - "learning_rate": 2.7849785247263515e-05, - "loss": 0.2084, - "step": 582 - }, - { - "epoch": 2.369918699186992, - "grad_norm": 1.3990095853805542, - "learning_rate": 2.7785289868934583e-05, - "loss": 0.3378, - "step": 583 - }, - { - "epoch": 2.3739837398373984, - "grad_norm": 1.048060655593872, - "learning_rate": 2.7720775714774965e-05, - "loss": 0.196, - "step": 584 - }, - { - "epoch": 2.3780487804878048, - "grad_norm": 1.0427680015563965, - "learning_rate": 2.7656243219679014e-05, - "loss": 0.2015, - "step": 585 - }, - { - "epoch": 2.3821138211382116, - "grad_norm": 1.0151643753051758, - "learning_rate": 2.7591692818664723e-05, - "loss": 0.1724, - "step": 586 - }, - { - "epoch": 2.386178861788618, - "grad_norm": 1.424041986465454, - "learning_rate": 2.7527124946870785e-05, - "loss": 0.301, - "step": 587 - }, - { - "epoch": 2.3902439024390243, - "grad_norm": 1.4749501943588257, - "learning_rate": 2.7462540039553663e-05, - "loss": 0.3463, - "step": 588 - }, - { - "epoch": 2.394308943089431, - "grad_norm": 1.118706226348877, - "learning_rate": 2.7397938532084672e-05, - "loss": 0.2522, - "step": 589 - }, - { - "epoch": 2.3983739837398375, - "grad_norm": 0.8898719549179077, - "learning_rate": 2.733332085994701e-05, - "loss": 0.1872, - "step": 590 - }, - { - "epoch": 2.402439024390244, - "grad_norm": 1.0836623907089233, - "learning_rate": 2.726868745873286e-05, - "loss": 0.2208, - "step": 591 - }, - { - "epoch": 2.40650406504065, - "grad_norm": 1.161892056465149, - "learning_rate": 2.7204038764140422e-05, - "loss": 0.2824, - "step": 592 - }, - { - "epoch": 2.410569105691057, - "grad_norm": 1.4567372798919678, - "learning_rate": 2.7139375211970996e-05, - "loss": 0.2535, - "step": 593 - }, - { - "epoch": 2.4146341463414633, - "grad_norm": 0.984106183052063, - "learning_rate": 2.707469723812604e-05, - "loss": 0.3006, - "step": 594 - }, - { - "epoch": 2.41869918699187, - "grad_norm": 1.4219437837600708, - "learning_rate": 2.701000527860422e-05, - "loss": 0.2342, - "step": 595 - }, - { - "epoch": 2.4227642276422765, - "grad_norm": 1.267050862312317, - "learning_rate": 2.6945299769498494e-05, - "loss": 0.2464, - "step": 596 - }, - { - "epoch": 2.426829268292683, - "grad_norm": 1.172346830368042, - "learning_rate": 2.6880581146993155e-05, - "loss": 0.2869, - "step": 597 - }, - { - "epoch": 2.430894308943089, - "grad_norm": 0.9664038419723511, - "learning_rate": 2.6815849847360887e-05, - "loss": 0.1885, - "step": 598 - }, - { - "epoch": 2.434959349593496, - "grad_norm": 1.2114192247390747, - "learning_rate": 2.675110630695983e-05, - "loss": 0.2953, - "step": 599 - }, - { - "epoch": 2.4390243902439024, - "grad_norm": 1.8831641674041748, - "learning_rate": 2.668635096223065e-05, - "loss": 0.2808, - "step": 600 - }, - { - "epoch": 2.443089430894309, - "grad_norm": 2.0404345989227295, - "learning_rate": 2.6621584249693575e-05, - "loss": 0.3206, - "step": 601 - }, - { - "epoch": 2.4471544715447155, - "grad_norm": 1.2943085432052612, - "learning_rate": 2.655680660594549e-05, - "loss": 0.2913, - "step": 602 - }, - { - "epoch": 2.451219512195122, - "grad_norm": 1.397364854812622, - "learning_rate": 2.6492018467656925e-05, - "loss": 0.385, - "step": 603 - }, - { - "epoch": 2.4552845528455283, - "grad_norm": 1.188985824584961, - "learning_rate": 2.6427220271569203e-05, - "loss": 0.1922, - "step": 604 - }, - { - "epoch": 2.459349593495935, - "grad_norm": 1.3022329807281494, - "learning_rate": 2.6362412454491408e-05, - "loss": 0.2549, - "step": 605 - }, - { - "epoch": 2.4634146341463414, - "grad_norm": 1.274584412574768, - "learning_rate": 2.6297595453297498e-05, - "loss": 0.2326, - "step": 606 - }, - { - "epoch": 2.467479674796748, - "grad_norm": 1.0968539714813232, - "learning_rate": 2.623276970492334e-05, - "loss": 0.2439, - "step": 607 - }, - { - "epoch": 2.4715447154471546, - "grad_norm": 2.251969814300537, - "learning_rate": 2.616793564636376e-05, - "loss": 0.3921, - "step": 608 - }, - { - "epoch": 2.475609756097561, - "grad_norm": 0.9488062858581543, - "learning_rate": 2.6103093714669614e-05, - "loss": 0.197, - "step": 609 - }, - { - "epoch": 2.4796747967479673, - "grad_norm": 1.3157691955566406, - "learning_rate": 2.603824434694483e-05, - "loss": 0.3197, - "step": 610 - }, - { - "epoch": 2.483739837398374, - "grad_norm": 1.781714916229248, - "learning_rate": 2.5973387980343443e-05, - "loss": 0.3464, - "step": 611 - }, - { - "epoch": 2.4878048780487805, - "grad_norm": 1.6457289457321167, - "learning_rate": 2.59085250520667e-05, - "loss": 0.3564, - "step": 612 - }, - { - "epoch": 2.491869918699187, - "grad_norm": 1.4104270935058594, - "learning_rate": 2.584365599936006e-05, - "loss": 0.3106, - "step": 613 - }, - { - "epoch": 2.4959349593495936, - "grad_norm": 1.7105491161346436, - "learning_rate": 2.5778781259510264e-05, - "loss": 0.1892, - "step": 614 - }, - { - "epoch": 2.5, - "grad_norm": 1.433476448059082, - "learning_rate": 2.5713901269842404e-05, - "loss": 0.2437, - "step": 615 - }, - { - "epoch": 2.5040650406504064, - "grad_norm": 1.7529551982879639, - "learning_rate": 2.564901646771696e-05, - "loss": 0.323, - "step": 616 - }, - { - "epoch": 2.508130081300813, - "grad_norm": 1.5591830015182495, - "learning_rate": 2.5584127290526838e-05, - "loss": 0.1626, - "step": 617 - }, - { - "epoch": 2.5121951219512195, - "grad_norm": 1.609719157218933, - "learning_rate": 2.5519234175694455e-05, - "loss": 0.2375, - "step": 618 - }, - { - "epoch": 2.516260162601626, - "grad_norm": 1.1900156736373901, - "learning_rate": 2.5454337560668762e-05, - "loss": 0.1404, - "step": 619 - }, - { - "epoch": 2.5203252032520327, - "grad_norm": 1.2599084377288818, - "learning_rate": 2.53894378829223e-05, - "loss": 0.2331, - "step": 620 - }, - { - "epoch": 2.524390243902439, - "grad_norm": 1.434040904045105, - "learning_rate": 2.5324535579948274e-05, - "loss": 0.2463, - "step": 621 - }, - { - "epoch": 2.5284552845528454, - "grad_norm": 2.0247273445129395, - "learning_rate": 2.5259631089257567e-05, - "loss": 0.4743, - "step": 622 - }, - { - "epoch": 2.5325203252032518, - "grad_norm": 2.0697364807128906, - "learning_rate": 2.5194724848375823e-05, - "loss": 0.3134, - "step": 623 - }, - { - "epoch": 2.5365853658536586, - "grad_norm": 1.6057186126708984, - "learning_rate": 2.5129817294840474e-05, - "loss": 0.2391, - "step": 624 - }, - { - "epoch": 2.540650406504065, - "grad_norm": 1.2188398838043213, - "learning_rate": 2.50649088661978e-05, - "loss": 0.215, - "step": 625 - }, - { - "epoch": 2.5447154471544717, - "grad_norm": 1.7813230752944946, - "learning_rate": 2.5e-05, - "loss": 0.3076, - "step": 626 - }, - { - "epoch": 2.548780487804878, - "grad_norm": 1.3773738145828247, - "learning_rate": 2.4935091133802203e-05, - "loss": 0.2307, - "step": 627 - }, - { - "epoch": 2.5528455284552845, - "grad_norm": 1.1542174816131592, - "learning_rate": 2.4870182705159535e-05, - "loss": 0.2453, - "step": 628 - }, - { - "epoch": 2.556910569105691, - "grad_norm": 1.479161262512207, - "learning_rate": 2.480527515162418e-05, - "loss": 0.2303, - "step": 629 - }, - { - "epoch": 2.5609756097560976, - "grad_norm": 1.3709288835525513, - "learning_rate": 2.4740368910742436e-05, - "loss": 0.2547, - "step": 630 - }, - { - "epoch": 2.565040650406504, - "grad_norm": 1.4312622547149658, - "learning_rate": 2.4675464420051732e-05, - "loss": 0.3329, - "step": 631 - }, - { - "epoch": 2.569105691056911, - "grad_norm": 1.1703144311904907, - "learning_rate": 2.4610562117077708e-05, - "loss": 0.1889, - "step": 632 - }, - { - "epoch": 2.573170731707317, - "grad_norm": 1.4480347633361816, - "learning_rate": 2.454566243933124e-05, - "loss": 0.1835, - "step": 633 - }, - { - "epoch": 2.5772357723577235, - "grad_norm": 1.3958348035812378, - "learning_rate": 2.4480765824305548e-05, - "loss": 0.2884, - "step": 634 - }, - { - "epoch": 2.58130081300813, - "grad_norm": 2.118567943572998, - "learning_rate": 2.4415872709473165e-05, - "loss": 0.2322, - "step": 635 - }, - { - "epoch": 2.5853658536585367, - "grad_norm": 1.1316864490509033, - "learning_rate": 2.4350983532283047e-05, - "loss": 0.1848, - "step": 636 - }, - { - "epoch": 2.589430894308943, - "grad_norm": 1.2410178184509277, - "learning_rate": 2.42860987301576e-05, - "loss": 0.1824, - "step": 637 - }, - { - "epoch": 2.59349593495935, - "grad_norm": 1.8990293741226196, - "learning_rate": 2.422121874048974e-05, - "loss": 0.2174, - "step": 638 - }, - { - "epoch": 2.597560975609756, - "grad_norm": 1.9141960144042969, - "learning_rate": 2.4156344000639945e-05, - "loss": 0.3189, - "step": 639 - }, - { - "epoch": 2.6016260162601625, - "grad_norm": 1.5429431200027466, - "learning_rate": 2.4091474947933308e-05, - "loss": 0.3784, - "step": 640 - }, - { - "epoch": 2.605691056910569, - "grad_norm": 1.5931179523468018, - "learning_rate": 2.4026612019656562e-05, - "loss": 0.2086, - "step": 641 - }, - { - "epoch": 2.6097560975609757, - "grad_norm": 1.7491989135742188, - "learning_rate": 2.3961755653055177e-05, - "loss": 0.2942, - "step": 642 - }, - { - "epoch": 2.613821138211382, - "grad_norm": 1.666337251663208, - "learning_rate": 2.389690628533039e-05, - "loss": 0.2291, - "step": 643 - }, - { - "epoch": 2.617886178861789, - "grad_norm": 0.9534549713134766, - "learning_rate": 2.3832064353636245e-05, - "loss": 0.1759, - "step": 644 - }, - { - "epoch": 2.6219512195121952, - "grad_norm": 2.320765256881714, - "learning_rate": 2.376723029507667e-05, - "loss": 0.3008, - "step": 645 - }, - { - "epoch": 2.6260162601626016, - "grad_norm": 1.2658528089523315, - "learning_rate": 2.370240454670251e-05, - "loss": 0.2179, - "step": 646 - }, - { - "epoch": 2.630081300813008, - "grad_norm": 3.025686025619507, - "learning_rate": 2.3637587545508595e-05, - "loss": 0.4052, - "step": 647 - }, - { - "epoch": 2.6341463414634148, - "grad_norm": 1.0552164316177368, - "learning_rate": 2.35727797284308e-05, - "loss": 0.1685, - "step": 648 - }, - { - "epoch": 2.638211382113821, - "grad_norm": 1.5774232149124146, - "learning_rate": 2.3507981532343078e-05, - "loss": 0.2028, - "step": 649 - }, - { - "epoch": 2.642276422764228, - "grad_norm": 1.203285574913025, - "learning_rate": 2.3443193394054523e-05, - "loss": 0.2726, - "step": 650 - }, - { - "epoch": 2.6463414634146343, - "grad_norm": 1.2940318584442139, - "learning_rate": 2.3378415750306424e-05, - "loss": 0.126, - "step": 651 - }, - { - "epoch": 2.6504065040650406, - "grad_norm": 1.0125936269760132, - "learning_rate": 2.3313649037769356e-05, - "loss": 0.1924, - "step": 652 - }, - { - "epoch": 2.654471544715447, - "grad_norm": 1.2838267087936401, - "learning_rate": 2.324889369304018e-05, - "loss": 0.1803, - "step": 653 - }, - { - "epoch": 2.658536585365854, - "grad_norm": 1.1884238719940186, - "learning_rate": 2.3184150152639126e-05, - "loss": 0.2616, - "step": 654 - }, - { - "epoch": 2.66260162601626, - "grad_norm": 1.1874499320983887, - "learning_rate": 2.3119418853006844e-05, - "loss": 0.2926, - "step": 655 - }, - { - "epoch": 2.6666666666666665, - "grad_norm": 1.3287999629974365, - "learning_rate": 2.3054700230501502e-05, - "loss": 0.2557, - "step": 656 - }, - { - "epoch": 2.6707317073170733, - "grad_norm": 1.700911045074463, - "learning_rate": 2.298999472139578e-05, - "loss": 0.2221, - "step": 657 - }, - { - "epoch": 2.6747967479674797, - "grad_norm": 2.2759194374084473, - "learning_rate": 2.2925302761873967e-05, - "loss": 0.3972, - "step": 658 - }, - { - "epoch": 2.678861788617886, - "grad_norm": 1.7204632759094238, - "learning_rate": 2.2860624788029013e-05, - "loss": 0.2543, - "step": 659 - }, - { - "epoch": 2.682926829268293, - "grad_norm": 1.6322100162506104, - "learning_rate": 2.279596123585958e-05, - "loss": 0.2059, - "step": 660 - }, - { - "epoch": 2.686991869918699, - "grad_norm": 1.6281975507736206, - "learning_rate": 2.2731312541267145e-05, - "loss": 0.2124, - "step": 661 - }, - { - "epoch": 2.6910569105691056, - "grad_norm": 1.6417791843414307, - "learning_rate": 2.2666679140052995e-05, - "loss": 0.3243, - "step": 662 - }, - { - "epoch": 2.6951219512195124, - "grad_norm": 1.1513478755950928, - "learning_rate": 2.260206146791534e-05, - "loss": 0.2381, - "step": 663 - }, - { - "epoch": 2.6991869918699187, - "grad_norm": 2.2059972286224365, - "learning_rate": 2.253745996044634e-05, - "loss": 0.3278, - "step": 664 - }, - { - "epoch": 2.703252032520325, - "grad_norm": 1.1836122274398804, - "learning_rate": 2.247287505312922e-05, - "loss": 0.2196, - "step": 665 - }, - { - "epoch": 2.7073170731707314, - "grad_norm": 1.959965467453003, - "learning_rate": 2.2408307181335286e-05, - "loss": 0.3299, - "step": 666 - }, - { - "epoch": 2.7113821138211383, - "grad_norm": 0.9316858053207397, - "learning_rate": 2.2343756780320996e-05, - "loss": 0.2262, - "step": 667 - }, - { - "epoch": 2.7154471544715446, - "grad_norm": 2.0684008598327637, - "learning_rate": 2.2279224285225044e-05, - "loss": 0.2921, - "step": 668 - }, - { - "epoch": 2.7195121951219514, - "grad_norm": 1.1755043268203735, - "learning_rate": 2.221471013106542e-05, - "loss": 0.2003, - "step": 669 - }, - { - "epoch": 2.7235772357723578, - "grad_norm": 0.9501457810401917, - "learning_rate": 2.2150214752736488e-05, - "loss": 0.1959, - "step": 670 - }, - { - "epoch": 2.727642276422764, - "grad_norm": 0.9543282985687256, - "learning_rate": 2.2085738585006024e-05, - "loss": 0.1517, - "step": 671 - }, - { - "epoch": 2.7317073170731705, - "grad_norm": 1.0968742370605469, - "learning_rate": 2.202128206251231e-05, - "loss": 0.2697, - "step": 672 - }, - { - "epoch": 2.7357723577235773, - "grad_norm": 1.4550094604492188, - "learning_rate": 2.1956845619761203e-05, - "loss": 0.1645, - "step": 673 - }, - { - "epoch": 2.7398373983739837, - "grad_norm": 1.1911965608596802, - "learning_rate": 2.18924296911232e-05, - "loss": 0.1983, - "step": 674 - }, - { - "epoch": 2.7439024390243905, - "grad_norm": 1.4086023569107056, - "learning_rate": 2.182803471083051e-05, - "loss": 0.2132, - "step": 675 - }, - { - "epoch": 2.747967479674797, - "grad_norm": 1.2441579103469849, - "learning_rate": 2.1763661112974117e-05, - "loss": 0.2178, - "step": 676 - }, - { - "epoch": 2.752032520325203, - "grad_norm": 1.5801191329956055, - "learning_rate": 2.1699309331500884e-05, - "loss": 0.2429, - "step": 677 - }, - { - "epoch": 2.7560975609756095, - "grad_norm": 1.2247936725616455, - "learning_rate": 2.1634979800210594e-05, - "loss": 0.258, - "step": 678 - }, - { - "epoch": 2.7601626016260163, - "grad_norm": 1.4575227499008179, - "learning_rate": 2.157067295275304e-05, - "loss": 0.3143, - "step": 679 - }, - { - "epoch": 2.7642276422764227, - "grad_norm": 1.7117794752120972, - "learning_rate": 2.1506389222625122e-05, - "loss": 0.143, - "step": 680 - }, - { - "epoch": 2.7682926829268295, - "grad_norm": 1.3816112279891968, - "learning_rate": 2.1442129043167874e-05, - "loss": 0.165, - "step": 681 - }, - { - "epoch": 2.772357723577236, - "grad_norm": 1.3030447959899902, - "learning_rate": 2.1377892847563592e-05, - "loss": 0.3408, - "step": 682 - }, - { - "epoch": 2.7764227642276422, - "grad_norm": 1.4940840005874634, - "learning_rate": 2.131368106883289e-05, - "loss": 0.2811, - "step": 683 - }, - { - "epoch": 2.7804878048780486, - "grad_norm": 1.0134522914886475, - "learning_rate": 2.124949413983179e-05, - "loss": 0.1957, - "step": 684 - }, - { - "epoch": 2.7845528455284554, - "grad_norm": 1.9238392114639282, - "learning_rate": 2.1185332493248803e-05, - "loss": 0.3542, - "step": 685 - }, - { - "epoch": 2.7886178861788617, - "grad_norm": 1.2955254316329956, - "learning_rate": 2.1121196561601993e-05, - "loss": 0.187, - "step": 686 - }, - { - "epoch": 2.7926829268292686, - "grad_norm": 1.7447516918182373, - "learning_rate": 2.1057086777236084e-05, - "loss": 0.2004, - "step": 687 - }, - { - "epoch": 2.796747967479675, - "grad_norm": 1.862177848815918, - "learning_rate": 2.0993003572319568e-05, - "loss": 0.2536, - "step": 688 - }, - { - "epoch": 2.8008130081300813, - "grad_norm": 1.6323784589767456, - "learning_rate": 2.092894737884172e-05, - "loss": 0.2388, - "step": 689 - }, - { - "epoch": 2.8048780487804876, - "grad_norm": 1.1899956464767456, - "learning_rate": 2.0864918628609757e-05, - "loss": 0.214, - "step": 690 - }, - { - "epoch": 2.8089430894308944, - "grad_norm": 1.5874601602554321, - "learning_rate": 2.0800917753245877e-05, - "loss": 0.3424, - "step": 691 - }, - { - "epoch": 2.813008130081301, - "grad_norm": 1.7331843376159668, - "learning_rate": 2.0736945184184405e-05, - "loss": 0.2522, - "step": 692 - }, - { - "epoch": 2.817073170731707, - "grad_norm": 1.1062155961990356, - "learning_rate": 2.0673001352668825e-05, - "loss": 0.1774, - "step": 693 - }, - { - "epoch": 2.821138211382114, - "grad_norm": 1.5582331418991089, - "learning_rate": 2.06090866897489e-05, - "loss": 0.2411, - "step": 694 - }, - { - "epoch": 2.8252032520325203, - "grad_norm": 1.5571695566177368, - "learning_rate": 2.0545201626277764e-05, - "loss": 0.2751, - "step": 695 - }, - { - "epoch": 2.8292682926829267, - "grad_norm": 1.0832115411758423, - "learning_rate": 2.0481346592909052e-05, - "loss": 0.2174, - "step": 696 - }, - { - "epoch": 2.8333333333333335, - "grad_norm": 1.2834655046463013, - "learning_rate": 2.0417522020093918e-05, - "loss": 0.2922, - "step": 697 - }, - { - "epoch": 2.83739837398374, - "grad_norm": 1.8166238069534302, - "learning_rate": 2.0353728338078203e-05, - "loss": 0.2885, - "step": 698 - }, - { - "epoch": 2.841463414634146, - "grad_norm": 1.6068440675735474, - "learning_rate": 2.0289965976899515e-05, - "loss": 0.2652, - "step": 699 - }, - { - "epoch": 2.845528455284553, - "grad_norm": 1.35345458984375, - "learning_rate": 2.02262353663843e-05, - "loss": 0.1786, - "step": 700 - }, - { - "epoch": 2.8495934959349594, - "grad_norm": 1.4601542949676514, - "learning_rate": 2.016253693614501e-05, - "loss": 0.244, - "step": 701 - }, - { - "epoch": 2.8536585365853657, - "grad_norm": 1.1295043230056763, - "learning_rate": 2.0098871115577143e-05, - "loss": 0.1548, - "step": 702 - }, - { - "epoch": 2.857723577235772, - "grad_norm": 0.9486492276191711, - "learning_rate": 2.003523833385637e-05, - "loss": 0.1864, - "step": 703 - }, - { - "epoch": 2.861788617886179, - "grad_norm": 0.888141393661499, - "learning_rate": 1.9971639019935646e-05, - "loss": 0.1231, - "step": 704 - }, - { - "epoch": 2.8658536585365852, - "grad_norm": 1.3335301876068115, - "learning_rate": 1.990807360254234e-05, - "loss": 0.2477, - "step": 705 - }, - { - "epoch": 2.869918699186992, - "grad_norm": 1.0107511281967163, - "learning_rate": 1.98445425101753e-05, - "loss": 0.2253, - "step": 706 - }, - { - "epoch": 2.8739837398373984, - "grad_norm": 1.77396821975708, - "learning_rate": 1.9781046171101985e-05, - "loss": 0.25, - "step": 707 - }, - { - "epoch": 2.8780487804878048, - "grad_norm": 1.62265944480896, - "learning_rate": 1.971758501335559e-05, - "loss": 0.2948, - "step": 708 - }, - { - "epoch": 2.882113821138211, - "grad_norm": 1.3595439195632935, - "learning_rate": 1.965415946473214e-05, - "loss": 0.234, - "step": 709 - }, - { - "epoch": 2.886178861788618, - "grad_norm": 1.5177040100097656, - "learning_rate": 1.9590769952787637e-05, - "loss": 0.1929, - "step": 710 - }, - { - "epoch": 2.8902439024390243, - "grad_norm": 1.524520993232727, - "learning_rate": 1.9527416904835132e-05, - "loss": 0.2472, - "step": 711 - }, - { - "epoch": 2.894308943089431, - "grad_norm": 2.1149516105651855, - "learning_rate": 1.946410074794189e-05, - "loss": 0.3021, - "step": 712 - }, - { - "epoch": 2.8983739837398375, - "grad_norm": 1.4887944459915161, - "learning_rate": 1.940082190892647e-05, - "loss": 0.1789, - "step": 713 - }, - { - "epoch": 2.902439024390244, - "grad_norm": 1.5499894618988037, - "learning_rate": 1.9337580814355888e-05, - "loss": 0.2565, - "step": 714 - }, - { - "epoch": 2.90650406504065, - "grad_norm": 1.6691902875900269, - "learning_rate": 1.9274377890542722e-05, - "loss": 0.2585, - "step": 715 - }, - { - "epoch": 2.910569105691057, - "grad_norm": 2.072730541229248, - "learning_rate": 1.921121356354222e-05, - "loss": 0.2749, - "step": 716 - }, - { - "epoch": 2.9146341463414633, - "grad_norm": 1.4965406656265259, - "learning_rate": 1.914808825914946e-05, - "loss": 0.2301, - "step": 717 - }, - { - "epoch": 2.91869918699187, - "grad_norm": 1.5173438787460327, - "learning_rate": 1.9085002402896453e-05, - "loss": 0.2622, - "step": 718 - }, - { - "epoch": 2.9227642276422765, - "grad_norm": 1.4438912868499756, - "learning_rate": 1.9021956420049322e-05, - "loss": 0.2534, - "step": 719 - }, - { - "epoch": 2.926829268292683, - "grad_norm": 0.9516803622245789, - "learning_rate": 1.895895073560536e-05, - "loss": 0.1676, - "step": 720 - }, - { - "epoch": 2.930894308943089, - "grad_norm": 1.6033302545547485, - "learning_rate": 1.889598577429022e-05, - "loss": 0.2491, - "step": 721 - }, - { - "epoch": 2.934959349593496, - "grad_norm": 1.2399641275405884, - "learning_rate": 1.8833061960555038e-05, - "loss": 0.2255, - "step": 722 - }, - { - "epoch": 2.9390243902439024, - "grad_norm": 1.0588748455047607, - "learning_rate": 1.8770179718573582e-05, - "loss": 0.1732, - "step": 723 - }, - { - "epoch": 2.943089430894309, - "grad_norm": 1.4869235754013062, - "learning_rate": 1.8707339472239374e-05, - "loss": 0.1903, - "step": 724 - }, - { - "epoch": 2.9471544715447155, - "grad_norm": 1.840806007385254, - "learning_rate": 1.8644541645162834e-05, - "loss": 0.3202, - "step": 725 - }, - { - "epoch": 2.951219512195122, - "grad_norm": 1.0827544927597046, - "learning_rate": 1.8581786660668434e-05, - "loss": 0.1697, - "step": 726 - }, - { - "epoch": 2.9552845528455283, - "grad_norm": 2.025104522705078, - "learning_rate": 1.851907494179183e-05, - "loss": 0.2958, - "step": 727 - }, - { - "epoch": 2.959349593495935, - "grad_norm": 1.1669261455535889, - "learning_rate": 1.8456406911277064e-05, - "loss": 0.1997, - "step": 728 - }, - { - "epoch": 2.9634146341463414, - "grad_norm": 2.022799015045166, - "learning_rate": 1.8393782991573625e-05, - "loss": 0.3304, - "step": 729 - }, - { - "epoch": 2.9674796747967482, - "grad_norm": 1.5250054597854614, - "learning_rate": 1.8331203604833667e-05, - "loss": 0.2653, - "step": 730 - }, - { - "epoch": 2.9715447154471546, - "grad_norm": 2.87687349319458, - "learning_rate": 1.8268669172909137e-05, - "loss": 0.4376, - "step": 731 - }, - { - "epoch": 2.975609756097561, - "grad_norm": 1.2509832382202148, - "learning_rate": 1.8206180117348958e-05, - "loss": 0.27, - "step": 732 - }, - { - "epoch": 2.9796747967479673, - "grad_norm": 1.086598515510559, - "learning_rate": 1.8143736859396148e-05, - "loss": 0.1601, - "step": 733 - }, - { - "epoch": 2.983739837398374, - "grad_norm": 0.9996317625045776, - "learning_rate": 1.8081339819985006e-05, - "loss": 0.194, - "step": 734 - }, - { - "epoch": 2.9878048780487805, - "grad_norm": 1.3918702602386475, - "learning_rate": 1.8018989419738254e-05, - "loss": 0.2885, - "step": 735 - }, - { - "epoch": 2.991869918699187, - "grad_norm": 1.4116777181625366, - "learning_rate": 1.795668607896426e-05, - "loss": 0.2002, - "step": 736 - }, - { - "epoch": 2.9959349593495936, - "grad_norm": 1.26211416721344, - "learning_rate": 1.7894430217654113e-05, - "loss": 0.2764, - "step": 737 - }, - { - "epoch": 3.0, - "grad_norm": 1.369281530380249, - "learning_rate": 1.7832222255478857e-05, - "loss": 0.0921, - "step": 738 - }, - { - "epoch": 3.0, - "eval_accuracy": 0.9029138567247742, - "eval_loss": 0.28349459171295166, - "eval_precision_bio": 0.6043956043956044, - "eval_precision_cs": 0.9247847147470398, - "eval_precision_math": 0.5109170305676856, - "eval_precision_physics": 0.4766355140186916, - "eval_recall_bio": 0.3125, - "eval_recall_cs": 0.9735127478753541, - "eval_recall_math": 0.2746478873239437, - "eval_recall_physics": 0.25888324873096447, - "eval_runtime": 30.8259, - "eval_samples_per_second": 254.948, - "eval_steps_per_second": 31.889, - "step": 738 - } - ], - "logging_steps": 1, - "max_steps": 1230, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.0462957752100896e+16, - "train_batch_size": 128, - "trial_name": null, - "trial_params": null -}